import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
import statsmodels.api as sm
from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
# Load the data into a pandas DataFrame
data_path = '/content/drive/MyDrive/organics.csv'
df = pd.read_csv(data_path)
# Preview structure and first rows.  The original wrote `df.head` without
# parentheses, which merely references the method and displays nothing in
# script context — call it and print the result.
df.info()
print(df.head())
print(df)
# Total count of NaN values per column
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 22223 non-null int64
1 DemAffl 21138 non-null float64
2 DemAge 20715 non-null float64
3 DemCluster 21549 non-null float64
4 DemClusterGroup 21549 non-null object
5 DemGender 19711 non-null object
6 DemReg 21758 non-null object
7 DemTVReg 21758 non-null object
8 PromClass 22223 non-null object
9 PromSpend 22223 non-null float64
10 PromTime 21942 non-null float64
11 TargetBuy 22223 non-null int64
12 TargetAmt 22223 non-null int64
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender \
0 140 10.0 76.0 16.0 C U
1 620 4.0 49.0 35.0 D U
2 868 5.0 70.0 27.0 D F
3 1120 10.0 65.0 51.0 F M
4 2313 11.0 68.0 4.0 A F
... ... ... ... ... ... ...
22218 52834058 13.0 65.0 9.0 B F
22219 52834376 15.0 73.0 34.0 D U
22220 52837057 9.0 70.0 15.0 B F
22221 52838096 11.0 66.0 8.0 B F
22222 52856469 3.0 75.0 53.0 F NaN
DemReg DemTVReg PromClass PromSpend PromTime TargetBuy \
0 Midlands Wales & West Gold 16000.00 4.0 0
1 Midlands Wales & West Gold 6000.00 5.0 0
2 Midlands Wales & West Silver 0.02 8.0 1
3 Midlands Midlands Tin 0.01 7.0 1
4 Midlands Midlands Tin 0.01 8.0 0
... ... ... ... ... ... ...
22218 South East London Silver 1500.00 5.0 0
22219 South East S & S East Gold 6053.06 12.0 0
22220 North Yorkshire Gold 6000.00 5.0 0
22221 North N West Silver 5000.00 5.0 0
22222 South West S West Gold 6000.00 2.0 0
TargetAmt
0 0
1 0
2 1
3 1
4 0
... ...
22218 0
22219 0
22220 0
22221 0
22222 0
[22223 rows x 13 columns]
ID 0
DemAffl 1085
DemAge 1508
DemCluster 674
DemClusterGroup 674
DemGender 2512
DemReg 465
DemTVReg 465
PromClass 0
PromSpend 0
PromTime 281
TargetBuy 0
TargetAmt 0
dtype: int64
# Replace missing genders with 'U' (unspecified), a category that already
# exists in this data.  Assign the result back instead of calling
# fillna(inplace=True) on a column selection: that chained-assignment form
# raises FutureWarning and stops modifying the frame under pandas
# copy-on-write (pandas 3.0 default).
df['DemGender'] = df['DemGender'].fillna('U')
# Drop customers with a missing age, then re-check missing-value counts.
df = df.dropna(subset=['DemAge'])
print(df.isnull().sum())
ID 0 DemAffl 1001 DemAge 0 DemCluster 620 DemClusterGroup 620 DemGender 0 DemReg 432 DemTVReg 432 PromClass 0 PromSpend 0 PromTime 259 TargetBuy 0 TargetAmt 0 dtype: int64
# Remove customers lacking an affluence grade, then re-inspect the
# remaining missing-value counts to see the knock-on effect elsewhere.
df = df.dropna(subset=['DemAffl'])
print(df.isnull().sum())
ID 0 DemAffl 0 DemAge 0 DemCluster 585 DemClusterGroup 585 DemGender 0 DemReg 411 DemTVReg 411 PromClass 0 PromSpend 0 PromTime 243 TargetBuy 0 TargetAmt 0 dtype: int64
# Remove customers lacking a residential-neighborhood cluster value, then
# re-inspect the remaining missing-value counts.
df = df.dropna(subset=['DemCluster'])
print(df.isnull().sum())
ID 0 DemAffl 0 DemAge 0 DemCluster 0 DemClusterGroup 0 DemGender 0 DemReg 399 DemTVReg 399 PromClass 0 PromSpend 0 PromTime 225 TargetBuy 0 TargetAmt 0 dtype: int64
# Count distinct categories in the geographic and TV regions to judge
# whether their cardinality would make them detrimental as model features;
# this informs whether rows missing those values are worth keeping.
n_reg = df['DemReg'].nunique()
n_tv_reg = df['DemTVReg'].nunique()
print(n_reg)
print(n_tv_reg)
5 12
# Remove customers lacking a geographic region, then re-inspect the
# remaining missing-value counts.
df = df.dropna(subset=['DemReg'])
print(df.isnull().sum())
ID 0 DemAffl 0 DemAge 0 DemCluster 0 DemClusterGroup 0 DemGender 0 DemReg 0 DemTVReg 0 PromClass 0 PromSpend 0 PromTime 220 TargetBuy 0 TargetAmt 0 dtype: int64
# Remove customers missing the loyalty-membership duration (PromTime),
# then confirm no missing values remain anywhere.
df = df.dropna(subset=['PromTime'])
print(df.isnull().sum())
ID 0 DemAffl 0 DemAge 0 DemCluster 0 DemClusterGroup 0 DemGender 0 DemReg 0 DemTVReg 0 PromClass 0 PromSpend 0 PromTime 0 TargetBuy 0 TargetAmt 0 dtype: int64
def dummyize(x, columns):
    """Return the 0-based positional code of category *x* within *columns*.

    Used to replace a categorical value with an integer so the column can
    feed decision-tree / logistic-regression models.  Returns None when
    *x* is not found, matching the original fall-through behaviour.
    """
    # enumerate replaces the original range(len(...)) index loop
    for code, category in enumerate(columns):
        if x == category:
            return code
    return None
# Integer-encode the categorical columns with dummyize so they can be used
# in decision-tree and logistic-regression models.  The category list is
# computed once per column here — the original rebuilt
# pd.get_dummies(df[col]).columns inside the lambda, i.e. one full-column
# pass per ROW, which is quadratic work for no behavioural difference.
for _col in ['DemGender', 'DemClusterGroup', 'DemReg', 'PromClass']:
    _cats = pd.get_dummies(df[_col]).columns
    df[_col] = df[_col].apply(lambda x, cats=_cats: dummyize(x, cats))
# Remove columns that would hurt the models:
#   ID        - unique identifier per customer (no predictive value)
#   TargetAmt - leaks whether the customer bought organic products
#   DemTVReg  - too many categories
df = df.drop(columns=['DemTVReg', 'TargetAmt', 'ID'])
# Cast the target to a categorical dtype for model training
df['TargetBuy'] = df['TargetBuy'].astype('category')
# Separate the prediction target (whether the customer bought organic
# products) from the feature matrix.
y = df['TargetBuy']
X = df.drop(columns='TargetBuy')
# Hold out 30% of customers as a test set so models can be compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=77)
X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 12957 entries, 779 to 18057 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DemAffl 12957 non-null float64 1 DemAge 12957 non-null float64 2 DemCluster 12957 non-null float64 3 DemClusterGroup 12957 non-null int64 4 DemGender 12957 non-null int64 5 DemReg 12957 non-null int64 6 PromClass 12957 non-null int64 7 PromSpend 12957 non-null float64 8 PromTime 12957 non-null float64 dtypes: float64(5), int64(4) memory usage: 1012.3 KB <class 'pandas.core.series.Series'> Int64Index: 12957 entries, 779 to 18057 Series name: TargetBuy Non-Null Count Dtype -------------- ----- 12957 non-null category dtypes: category(1) memory usage: 114.0 KB
# Build a decision tree with hand-picked hyperparameters and fit it on the
# training split.
dt = DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
dt.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Export the fitted tree to DOT source and render it to a PDF on disk.
graph = graphviz.Source(
    export_graphviz(dt, filled=True, rounded=True,
                    feature_names=X.columns, class_names=['0', '1']))
graph.render("decision_tree_default")
'decision_tree_default.pdf'
#Show the visualization of the decision tree in this notebook
# Write the tree to tree.dot, then convert it to a PNG with the graphviz
# `dot` command-line tool.
export_graphviz(dt,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
# NOTE: `!` shell escape and `%matplotlib` magic are only valid inside a
# Jupyter/IPython cell, not in plain Python.
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
# Load the rendered PNG with OpenCV and display it on a large figure.
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7ff4593df2b0>
# Class predictions and probabilities for both splits
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)
y_prob_train = dt.predict_proba(X_train)
y_prob_test = dt.predict_proba(X_test)
# Training-set accuracy / precision / recall
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
print(" -- train set -- ")
print(f"Accuracy : {acc_train:.4f}")
print(f"Precision: {prec_train:.4f}")
print(f"Recall. : {rec_train:.4f}")
print("")
# Test-set accuracy / precision / recall
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
print(" -- test set -- ")
print(f"Accuracy : {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall. : {rec_test:.4f}")
-- train set -- Accuracy : 0.8210 Precision: 0.7204 Recall. : 0.4346 -- test set -- Accuracy : 0.8131 Precision: 0.6944 Recall. : 0.4138
# Confusion matrices for both splits.  sklearn's signature is
# confusion_matrix(y_true, y_pred); the original passed the arguments
# swapped, which transposes the matrix and mislabels FP/FN counts.
conf_matrix = confusion_matrix(y_train, y_pred_train)
print(conf_matrix)
conf_matrix = confusion_matrix(y_test, y_pred_test)
print(conf_matrix)
[[9265 1786] [ 533 1373]] [[3956 792] [ 246 559]]
# ROC curve and AUC for the training data (positive-class probabilities)
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:, 1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
# Chance diagonal for reference
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
# ROC curve and AUC for the test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:, 1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
# Pair each feature name with the tree's importance score
tree_imp = dt.feature_importances_
df = pd.DataFrame({'importance': tree_imp})
df['feature'] = X.columns
# Largest importances first
df2 = df.sort_values('importance', ascending=False)
print(df2)
# Horizontal bar chart of the ranked importances
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df2, y='feature', x='importance', color="lightblue")
plt.show()
importance feature 1 0.506989 DemAge 0 0.341903 DemAffl 4 0.151108 DemGender 2 0.000000 DemCluster 3 0.000000 DemClusterGroup 5 0.000000 DemReg 6 0.000000 PromClass 7 0.000000 PromSpend 8 0.000000 PromTime
# Grid-search a decision tree over depth, leaf size, and pruning strength
# to obtain the hyperparameters ("tuned" values) that cross-validate best
# on the training set.
param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1],
}
dt_tune = DecisionTreeClassifier()
grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
print(best_estimator)
{'ccp_alpha': 0, 'max_depth': 5, 'min_samples_leaf': 10}
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
# Fit a decision tree with the tuned hyperparameters.  The grid search
# above selected max_depth=5, but the original hard-coded max_depth=25
# here despite the "tuned" label — contradicting best_params_ and
# producing the overfit train/test gap seen in the outputs.
dt_tuned = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, ccp_alpha=0)
dt_tuned.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0, max_depth=25, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0, max_depth=25, min_samples_leaf=10)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Export the tuned tree to DOT source and render it to a PDF on disk.
graph = graphviz.Source(
    export_graphviz(dt_tuned, filled=True, rounded=True,
                    feature_names=X.columns, class_names=['0', '1']))
graph.render("decision_tree_tuned")
'decision_tree_tuned.pdf'
#Show the visualization of the decision tree in this notebook
# Write the tuned tree to tree.dot, then convert it to a PNG with the
# graphviz `dot` command-line tool.
export_graphviz(dt_tuned,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
# NOTE: `!` shell escape and `%matplotlib` magic are only valid inside a
# Jupyter/IPython cell, not in plain Python.
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
# Load the rendered PNG with OpenCV and display it on a large figure.
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.721089 to fit
<matplotlib.image.AxesImage at 0x7ff45e445600>
# Class predictions and probabilities for both splits (tuned tree)
y_pred_train = dt_tuned.predict(X_train)
y_pred_test = dt_tuned.predict(X_test)
y_prob_train = dt_tuned.predict_proba(X_train)
y_prob_test = dt_tuned.predict_proba(X_test)
# Training-set accuracy / precision / recall
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
print(" -- train set -- ")
print(f"Accuracy : {acc_train:.4f}")
print(f"Precision: {prec_train:.4f}")
print(f"Recall. : {rec_train:.4f}")
print("")
# Test-set accuracy / precision / recall
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
print(" -- test set -- ")
print(f"Accuracy : {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall. : {rec_test:.4f}")
-- train set -- Accuracy : 0.8567 Precision: 0.7763 Recall. : 0.5790 -- test set -- Accuracy : 0.7936 Precision: 0.6060 Recall. : 0.4338
# Confusion matrices for both splits.  sklearn's signature is
# confusion_matrix(y_true, y_pred); the original passed the arguments
# swapped, which transposes the matrix and mislabels FP/FN counts.
conf_matrix = confusion_matrix(y_train, y_pred_train)
print(conf_matrix)
conf_matrix = confusion_matrix(y_test, y_pred_test)
print(conf_matrix)
[[9271 1330] [ 527 1829]] [[3821 765] [ 381 586]]
# ROC curve and AUC for the training data (tuned tree)
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:, 1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
# ROC curve and AUC for the test data (tuned tree)
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:, 1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
# Pair each feature name with the tuned tree's importance score
tree_imp = dt_tuned.feature_importances_
df = pd.DataFrame({'importance': tree_imp})
df['feature'] = X.columns
# Largest importances first
df2 = df.sort_values('importance', ascending=False)
print(df2)
# Horizontal bar chart of the ranked importances
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df2, y='feature', x='importance', color="lightblue")
plt.show()
importance feature 1 0.394919 DemAge 0 0.260241 DemAffl 4 0.105844 DemGender 2 0.084088 DemCluster 7 0.063271 PromSpend 8 0.049167 PromTime 5 0.029433 DemReg 6 0.007883 PromClass 3 0.005154 DemClusterGroup
# Reload the full dataset so customers who were dropped earlier for missing
# values in columns the previous tree models did not use can be included.
df = pd.read_csv(data_path)
# Preview structure and first rows.  The original wrote `df.head` without
# parentheses, which merely references the method — call it and print.
df.info()
print(df.head())
print(df)
# Total count of NaN values per column
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 22223 non-null int64
1 DemAffl 21138 non-null float64
2 DemAge 20715 non-null float64
3 DemCluster 21549 non-null float64
4 DemClusterGroup 21549 non-null object
5 DemGender 19711 non-null object
6 DemReg 21758 non-null object
7 DemTVReg 21758 non-null object
8 PromClass 22223 non-null object
9 PromSpend 22223 non-null float64
10 PromTime 21942 non-null float64
11 TargetBuy 22223 non-null int64
12 TargetAmt 22223 non-null int64
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender \
0 140 10.0 76.0 16.0 C U
1 620 4.0 49.0 35.0 D U
2 868 5.0 70.0 27.0 D F
3 1120 10.0 65.0 51.0 F M
4 2313 11.0 68.0 4.0 A F
... ... ... ... ... ... ...
22218 52834058 13.0 65.0 9.0 B F
22219 52834376 15.0 73.0 34.0 D U
22220 52837057 9.0 70.0 15.0 B F
22221 52838096 11.0 66.0 8.0 B F
22222 52856469 3.0 75.0 53.0 F NaN
DemReg DemTVReg PromClass PromSpend PromTime TargetBuy \
0 Midlands Wales & West Gold 16000.00 4.0 0
1 Midlands Wales & West Gold 6000.00 5.0 0
2 Midlands Wales & West Silver 0.02 8.0 1
3 Midlands Midlands Tin 0.01 7.0 1
4 Midlands Midlands Tin 0.01 8.0 0
... ... ... ... ... ... ...
22218 South East London Silver 1500.00 5.0 0
22219 South East S & S East Gold 6053.06 12.0 0
22220 North Yorkshire Gold 6000.00 5.0 0
22221 North N West Silver 5000.00 5.0 0
22222 South West S West Gold 6000.00 2.0 0
TargetAmt
0 0
1 0
2 1
3 1
4 0
... ...
22218 0
22219 0
22220 0
22221 0
22222 0
[22223 rows x 13 columns]
ID 0
DemAffl 1085
DemAge 1508
DemCluster 674
DemClusterGroup 674
DemGender 2512
DemReg 465
DemTVReg 465
PromClass 0
PromSpend 0
PromTime 281
TargetBuy 0
TargetAmt 0
dtype: int64
# Replace missing genders with 'U' (unspecified), a category that already
# exists in this data.  Assign back rather than using chained
# fillna(inplace=True), which is deprecated and a no-op under pandas
# copy-on-write.
df['DemGender'] = df['DemGender'].fillna('U')
# Drop customers missing age or affluence grade in one pass.
df = df.dropna(subset=['DemAge', 'DemAffl'])
print(df.isnull().sum())
ID 0 DemAffl 0 DemAge 0 DemCluster 585 DemClusterGroup 585 DemGender 0 DemReg 411 DemTVReg 411 PromClass 0 PromSpend 0 PromTime 243 TargetBuy 0 TargetAmt 0 dtype: int64
# Drop the columns the earlier models did not use, so customers whose only
# missing values were in those columns can now be kept, together with the
# columns that would hurt the models:
#   ID        - unique identifier per customer (no predictive value)
#   TargetAmt - leaks whether the customer bought organic products
#   DemTVReg  - too many categories
df = df.drop(columns=['DemCluster', 'DemClusterGroup', 'DemReg', 'DemTVReg',
                      'PromTime', 'ID', 'TargetAmt'])
print(df.isnull().sum())
DemAffl 0 DemAge 0 DemGender 0 PromClass 0 PromSpend 0 TargetBuy 0 dtype: int64
# Integer-encode the remaining categorical columns with dummyize for the
# tree and logistic-regression models.  Category lists are computed once
# per column — the original rebuilt pd.get_dummies inside the lambda for
# every row, an O(rows) repetition of a full-column pass.
for _col in ['DemGender', 'PromClass']:
    _cats = pd.get_dummies(df[_col]).columns
    df[_col] = df[_col].apply(lambda x, cats=_cats: dummyize(x, cats))
# Cast the target to a categorical dtype for model training
df['TargetBuy'] = df['TargetBuy'].astype('category')
# Separate the prediction target (whether the customer bought organic
# products) from the feature matrix, then hold out 30% for testing.
y = df['TargetBuy']
X = df.drop(columns='TargetBuy')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=77)
X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 13799 entries, 9631 to 16966 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DemAffl 13799 non-null float64 1 DemAge 13799 non-null float64 2 DemGender 13799 non-null int64 3 PromClass 13799 non-null int64 4 PromSpend 13799 non-null float64 dtypes: float64(3), int64(2) memory usage: 646.8 KB <class 'pandas.core.series.Series'> Int64Index: 13799 entries, 9631 to 16966 Series name: TargetBuy Non-Null Count Dtype -------------- ----- 13799 non-null category dtypes: category(1) memory usage: 121.4 KB
# Fit a tree with the same hand-picked hyperparameters as before, now on
# the larger dataset that keeps previously-dropped customers.
dt_more_data = DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25,
                                      min_samples_leaf=10)
dt_more_data.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Export the larger-dataset tree to DOT source and render it to a PDF.
graph = graphviz.Source(
    export_graphviz(dt_more_data, filled=True, rounded=True,
                    feature_names=X.columns, class_names=['0', '1']))
graph.render("decision_tree_default_more_data")
'decision_tree_default_more_data.pdf'
#Show the visualization of the decision tree in this notebook
# Write the tree to tree.dot, then convert it to a PNG with the graphviz
# `dot` command-line tool.
export_graphviz(dt_more_data,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
# NOTE: `!` shell escape and `%matplotlib` magic are only valid inside a
# Jupyter/IPython cell, not in plain Python.
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
# Load the rendered PNG with OpenCV and display it on a large figure.
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7ff458e69b10>
# Class predictions and probabilities for both splits (larger dataset)
y_pred_train = dt_more_data.predict(X_train)
y_pred_test = dt_more_data.predict(X_test)
y_prob_train = dt_more_data.predict_proba(X_train)
y_prob_test = dt_more_data.predict_proba(X_test)
# Training-set accuracy / precision / recall
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
print(" -- train set -- ")
print(f"Accuracy : {acc_train:.4f}")
print(f"Precision: {prec_train:.4f}")
print(f"Recall. : {rec_train:.4f}")
print("")
# Test-set accuracy / precision / recall
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
print(" -- test set -- ")
print(f"Accuracy : {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall. : {rec_test:.4f}")
-- train set -- Accuracy : 0.8147 Precision: 0.6517 Recall. : 0.5543 -- test set -- Accuracy : 0.8213 Precision: 0.6367 Recall. : 0.5631
# Confusion matrices for both splits.  sklearn's signature is
# confusion_matrix(y_true, y_pred); the original passed the arguments
# swapped, which transposes the matrix and mislabels FP/FN counts.
conf_matrix = confusion_matrix(y_train, y_pred_train)
print(conf_matrix)
conf_matrix = confusion_matrix(y_test, y_pred_test)
print(conf_matrix)
[[9332 1536] [1021 1910]] [[4073 609] [ 448 785]]
# ROC curve and AUC for the training data (larger dataset)
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:, 1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
# ROC curve and AUC for the test data (larger dataset)
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:, 1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
# Pair each feature name with the larger-dataset tree's importance score
tree_imp = dt_more_data.feature_importances_
df = pd.DataFrame({'importance': tree_imp})
df['feature'] = X.columns
# Largest importances first
df2 = df.sort_values('importance', ascending=False)
print(df2)
# Horizontal bar chart of the ranked importances, with enlarged labels
plt.figure(figsize=(40, 20))
plt.title('Variable Importance', fontsize=18)
plt.xlabel('Importance', fontsize=18)
plt.ylabel('Feature', fontsize=18)
plt.tick_params(axis='x', which='major', labelsize=18)
plt.tick_params(axis='y', which='major', labelsize=25)
sns.barplot(data=df2, y='feature', x='importance', color="lightblue")
plt.show()
importance feature 1 0.512657 DemAge 0 0.337917 DemAffl 2 0.149426 DemGender 3 0.000000 PromClass 4 0.000000 PromSpend
# Grid-search a decision tree on the NEW (larger) training set to obtain
# freshly tuned hyperparameters.  See "Default Tree, More Data" above for
# the reload that added back previously-dropped customers.
param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1],
}
dt_tune = DecisionTreeClassifier()
grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)
print(best_estimator)
{'ccp_alpha': 0, 'max_depth': 5, 'min_samples_leaf': 1}
DecisionTreeClassifier(ccp_alpha=0, max_depth=5)
# Fit a tree on the larger dataset with the hyperparameters the grid
# search just selected (max_depth=5, min_samples_leaf=1, ccp_alpha=0).
dt_tuned_more_data = DecisionTreeClassifier(ccp_alpha=0, max_depth=5,
                                            min_samples_leaf=1)
dt_tuned_more_data.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0, max_depth=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Export the tuned larger-dataset tree to DOT source and render a PDF.
graph = graphviz.Source(
    export_graphviz(dt_tuned_more_data, filled=True, rounded=True,
                    feature_names=X.columns, class_names=['0', '1']))
graph.render("decision_tree_tuned_more_data")
'decision_tree_tuned_more_data.pdf'
#Show the visualization of the decision tree in this notebook
# Write the tree to tree.dot, then convert it to a PNG with the graphviz
# `dot` command-line tool.
export_graphviz(dt_tuned_more_data,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
# NOTE: `!` shell escape and `%matplotlib` magic are only valid inside a
# Jupyter/IPython cell, not in plain Python.
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
# Load the rendered PNG with OpenCV and display it on a large figure.
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7ff458ffff10>
# Class predictions and probabilities for both splits (tuned, larger data)
y_pred_train = dt_tuned_more_data.predict(X_train)
y_pred_test = dt_tuned_more_data.predict(X_test)
y_prob_train = dt_tuned_more_data.predict_proba(X_train)
y_prob_test = dt_tuned_more_data.predict_proba(X_test)
# Training-set accuracy / precision / recall
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
print(" -- train set -- ")
print(f"Accuracy : {acc_train:.4f}")
print(f"Precision: {prec_train:.4f}")
print(f"Recall. : {rec_train:.4f}")
print("")
# Test-set accuracy / precision / recall
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
print(" -- test set -- ")
print(f"Accuracy : {acc_test:.4f}")
print(f"Precision: {prec_test:.4f}")
print(f"Recall. : {rec_test:.4f}")
-- train set -- Accuracy : 0.8179 Precision: 0.6671 Recall. : 0.5403 -- test set -- Accuracy : 0.8230 Precision: 0.6482 Recall. : 0.5445
# Confusion matrices for both splits.  sklearn's signature is
# confusion_matrix(y_true, y_pred); the original passed the arguments
# swapped, which transposes the matrix and mislabels FP/FN counts.
conf_matrix = confusion_matrix(y_train, y_pred_train)
print(conf_matrix)
conf_matrix = confusion_matrix(y_test, y_pred_test)
print(conf_matrix)
[[9424 1584] [ 929 1862]] [[4109 635] [ 412 759]]
# ROC curve and AUC for the training data (tuned, larger dataset)
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:, 1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
# ROC curve and AUC for the test data (tuned, larger dataset)
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:, 1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:, 1])
plt.figure()
plt.plot(fpr, tpr, lw=2, color='darkorange',
         label='ROC curve (area = {:.4f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
# Pair each feature name with the tuned larger-dataset tree's importance
tree_imp = dt_tuned_more_data.feature_importances_
df = pd.DataFrame({'importance': tree_imp})
df['feature'] = X.columns
# Largest importances first
df2 = df.sort_values('importance', ascending=False)
print(df2)
# Horizontal bar chart of the ranked importances
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df2, y='feature', x='importance', color="lightblue")
plt.show()
importance feature 1 0.503338 DemAge 0 0.337206 DemAffl 2 0.158338 DemGender 4 0.001118 PromSpend 3 0.000000 PromClass
# Fit a modified tuned tree: same depth as the grid-search winner but with
# min_samples_leaf raised from 1 to 10 to regularise the leaves.
dt_tuned_mod_more_data = DecisionTreeClassifier(ccp_alpha=0, max_depth=5,
                                                min_samples_leaf=10)
dt_tuned_mod_more_data.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
# Export the fitted tree to DOT text; class 0/1 labels match TargetBuy codes.
dot_data = export_graphviz(dt_tuned_mod_more_data, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
# Writes decision_tree_tuned_more_data_mod.pdf next to the notebook.
graph.render("decision_tree_tuned_more_data_mod")
'decision_tree_tuned_more_data_mod.pdf'
#Show the visualization of the decision tree in this notebook
# Write a DOT file, convert it to PNG with the graphviz CLI (shell escape),
# then load and display the image inline with OpenCV + matplotlib.
export_graphviz(dt_tuned_mod_more_data,
'tree.dot',
class_names=['0','1'],
feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
# NOTE(review): cv2.imread returns BGR; colors may look swapped vs the PDF.
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
<matplotlib.image.AxesImage at 0x7ff45e1b2290>
# Score the modified tuned tree on both splits: hard predictions for the
# threshold metrics, class probabilities for the ROC analysis that follows.
y_pred_train = dt_tuned_mod_more_data.predict(X_train)
y_pred_test = dt_tuned_mod_more_data.predict(X_test)
y_prob_train = dt_tuned_mod_more_data.predict_proba(X_train)
y_prob_test = dt_tuned_mod_more_data.predict_proba(X_test)
# threshold metrics for the training split
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)
print(" -- train set -- ")
for _label, _value in (("Accuracy :", acc_train),
                       ("Precision:", prec_train),
                       ("Recall. :", rec_train)):
    print("{} {:.4f}".format(_label, _value))
print("")
# threshold metrics for the testing split
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)
print(" -- test set -- ")
for _label, _value in (("Accuracy :", acc_test),
                       ("Precision:", prec_test),
                       ("Recall. :", rec_test)):
    print("{} {:.4f}".format(_label, _value))
-- train set -- Accuracy : 0.8177 Precision: 0.6657 Recall. : 0.5421 -- test set -- Accuracy : 0.8235 Precision: 0.6486 Recall. : 0.5481
# Confusion matrices for both splits.
# Fix: sklearn's signature is confusion_matrix(y_true, y_pred); the original
# passed the predictions first, which transposes the matrix (rows must be the
# actual classes, columns the predicted classes).
conf_matrix = confusion_matrix(y_train, y_pred_train)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_test, y_pred_test)
print(conf_matrix)
[[9415 1578] [ 938 1868]] [[4107 630] [ 414 764]]
# ROC curves for the modified tuned tree, one figure per split.
def _plot_roc(codes, probs, title):
    # compute the curve and AUC, then draw it with a chance diagonal
    fpr, tpr, _ = roc_curve(codes, probs)
    auc_value = roc_auc_score(codes, probs)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (area = {auc_value:.4f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()
_plot_roc(y_train.cat.codes, y_prob_train[:, 1],
          'Receiver Operating Characteristic for Training Data')
_plot_roc(y_test.cat.codes, y_prob_test[:, 1],
          'Receiver Operating Characteristic for Test Data')
#calculate feature importance
tree_imp = dt_tuned_mod_more_data.feature_importances_
# Build a (feature, importance) frame. NOTE: previously this cell rebound the
# name `df`, clobbering the variable that holds the raw dataset elsewhere in
# the notebook — use dedicated names instead.
imp_df = pd.DataFrame({'importance': tree_imp, 'feature': X.columns})
#sort data so features with largest importance values are at the top
imp_sorted = imp_df.sort_values(by=['importance'], ascending=False)
print(imp_sorted)
#Create variable importance plot
plt.figure(figsize=(40, 20))
plt.title('Variable Importance', fontsize=18)
plt.xlabel('Importance', fontsize=18)
plt.ylabel('Feature', fontsize=18)
plt.tick_params(axis='x', which='major', labelsize=18)
plt.tick_params(axis='y', which='major', labelsize=25)
sns.barplot(data=imp_sorted,
            y='feature',
            x='importance', color="lightblue")
plt.show()
importance feature 1 0.505439 DemAge 0 0.338826 DemAffl 2 0.155735 DemGender 3 0.000000 PromClass 4 0.000000 PromSpend
#Reload the data to include all of the columns except for the ones that were
#determined to be detrimental to machine learning models as it is in the
#original tree models
df = pd.read_csv(data_path)
# preview data
df.info()
# Removed dead line: `df.head` without parentheses was a no-op attribute
# access whose result was discarded.
print(df)
# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 22223 non-null int64
1 DemAffl 21138 non-null float64
2 DemAge 20715 non-null float64
3 DemCluster 21549 non-null float64
4 DemClusterGroup 21549 non-null object
5 DemGender 19711 non-null object
6 DemReg 21758 non-null object
7 DemTVReg 21758 non-null object
8 PromClass 22223 non-null object
9 PromSpend 22223 non-null float64
10 PromTime 21942 non-null float64
11 TargetBuy 22223 non-null int64
12 TargetAmt 22223 non-null int64
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender \
0 140 10.0 76.0 16.0 C U
1 620 4.0 49.0 35.0 D U
2 868 5.0 70.0 27.0 D F
3 1120 10.0 65.0 51.0 F M
4 2313 11.0 68.0 4.0 A F
... ... ... ... ... ... ...
22218 52834058 13.0 65.0 9.0 B F
22219 52834376 15.0 73.0 34.0 D U
22220 52837057 9.0 70.0 15.0 B F
22221 52838096 11.0 66.0 8.0 B F
22222 52856469 3.0 75.0 53.0 F NaN
DemReg DemTVReg PromClass PromSpend PromTime TargetBuy \
0 Midlands Wales & West Gold 16000.00 4.0 0
1 Midlands Wales & West Gold 6000.00 5.0 0
2 Midlands Wales & West Silver 0.02 8.0 1
3 Midlands Midlands Tin 0.01 7.0 1
4 Midlands Midlands Tin 0.01 8.0 0
... ... ... ... ... ... ...
22218 South East London Silver 1500.00 5.0 0
22219 South East S & S East Gold 6053.06 12.0 0
22220 North Yorkshire Gold 6000.00 5.0 0
22221 North N West Silver 5000.00 5.0 0
22222 South West S West Gold 6000.00 2.0 0
TargetAmt
0 0
1 0
2 1
3 1
4 0
... ...
22218 0
22219 0
22220 0
22221 0
22222 0
[22223 rows x 13 columns]
ID 0
DemAffl 1085
DemAge 1508
DemCluster 674
DemClusterGroup 674
DemGender 2512
DemReg 465
DemTVReg 465
PromClass 0
PromSpend 0
PromTime 281
TargetBuy 0
TargetAmt 0
dtype: int64
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
# Fix: assign back instead of fillna(..., inplace=True) on a column selection;
# chained in-place fills can act on a copy and are deprecated in pandas 2.x.
df['DemGender'] = df['DemGender'].fillna('U')
# Drop rows with missing age (same survivors as the original inplace dropna).
df = df.dropna(subset=['DemAge'])
print(df.isnull().sum())
ID 0 DemAffl 1001 DemAge 0 DemCluster 620 DemClusterGroup 620 DemGender 0 DemReg 432 DemTVReg 432 PromClass 0 PromSpend 0 PromTime 259 TargetBuy 0 TargetAmt 0 dtype: int64
# Remove customers that still have missing demographic / promotion values.
df = df.dropna(subset=['DemAffl', 'DemCluster', 'DemReg', 'PromTime'])
print(df.isnull().sum())
ID 0 DemAffl 0 DemAge 0 DemCluster 0 DemClusterGroup 0 DemGender 0 DemReg 0 DemTVReg 0 PromClass 0 PromSpend 0 PromTime 0 TargetBuy 0 TargetAmt 0 dtype: int64
#Make categorical columns into numerical dummies where each category is replaced by a number starting from 0
def dummyize(x, columns):
    """Return the 0-based position of *x* within *columns*.

    Mirrors the column order produced by pd.get_dummies; returns None when
    *x* is not found (same as the original's implicit fall-through).
    """
    for code, label in enumerate(columns):
        if x == label:
            return code
    return None
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
# Fix: the original rebuilt pd.get_dummies(df[col]).columns inside the lambda,
# i.e. once per ROW (O(n^2)). Compute each column's category order once.
for _col in ("DemGender", "DemClusterGroup", "DemReg", "PromClass"):
    _categories = pd.get_dummies(df[_col]).columns
    # bind the hoisted categories as a default argument
    df[_col] = df[_col].apply(lambda x, cats=_categories: dummyize(x, cats))
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df = df.drop(columns=['DemTVReg', 'TargetAmt', 'ID'])
# Cast the target to a categorical dtype for model training.
df['TargetBuy'] = df['TargetBuy'].astype('category')
# Separate the prediction target (did the customer buy organics) from
# the feature matrix.
X = df.drop(columns='TargetBuy')
y = df['TargetBuy']
# Hold out 30% of the customers for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)
X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 12957 entries, 779 to 18057 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DemAffl 12957 non-null float64 1 DemAge 12957 non-null float64 2 DemCluster 12957 non-null float64 3 DemClusterGroup 12957 non-null int64 4 DemGender 12957 non-null int64 5 DemReg 12957 non-null int64 6 PromClass 12957 non-null int64 7 PromSpend 12957 non-null float64 8 PromTime 12957 non-null float64 dtypes: float64(5), int64(4) memory usage: 1012.3 KB <class 'pandas.core.series.Series'> Int64Index: 12957 entries, 779 to 18057 Series name: TargetBuy Non-Null Count Dtype -------------- ----- 12957 non-null category dtypes: category(1) memory usage: 114.0 KB
#Fit full logistic regression model to examine the significance of the terms
# NOTE: X and y are deliberately rebound here to the constant-augmented
# training data; the prediction cell below relies on these bindings.
X = sm.add_constant(X_train)
y = y_train
logit_full = sm.Logit(y, X).fit()
#Print summary
print(logit_full.summary())
Optimization terminated successfully.
Current function value: 0.419787
Iterations 7
Logit Regression Results
==============================================================================
Dep. Variable: TargetBuy No. Observations: 12957
Model: Logit Df Residuals: 12947
Method: MLE Df Model: 9
Date: Tue, 21 Nov 2023 Pseudo R-squ.: 0.2442
Time: 23:05:42 Log-Likelihood: -5439.2
converged: True LL-Null: -7196.7
Covariance Type: nonrobust LLR p-value: 0.000
===================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------
const -0.2589 0.160 -1.622 0.105 -0.572 0.054
DemAffl 0.2612 0.008 33.495 0.000 0.246 0.276
DemAge -0.0550 0.002 -25.985 0.000 -0.059 -0.051
DemCluster -0.0010 0.009 -0.111 0.912 -0.018 0.016
DemClusterGroup 0.0334 0.086 0.388 0.698 -0.135 0.202
DemGender -0.9445 0.038 -25.061 0.000 -1.018 -0.871
DemReg -0.0041 0.018 -0.227 0.820 -0.040 0.031
PromClass -0.0446 0.026 -1.735 0.083 -0.095 0.006
PromSpend -1.732e-06 3.92e-06 -0.442 0.659 -9.42e-06 5.96e-06
PromTime 0.0059 0.006 1.026 0.305 -0.005 0.017
===================================================================================
# Generate predicted values for training set
# (X already carries the constant column added in the fitting cell above)
pprob = logit_full.predict(X)
# Create predicted category for success using 50% cutoff
psuccess = (pprob > 0.5).astype(int)
# Add new variables to the training data set
# NOTE(review): this mutates X_train in place — the split frame now carries
# prediction columns; later cells re-split from X2 so they are unaffected,
# but re-running earlier cells against this X_train would leak predictions.
X_train['p_success'] = psuccess
X_train['p_prob'] = pprob
X_train['y'] = y_train
X_train.info()
# Generate predicted values for test set
# add_constant here rebinds X_test to a constant-augmented copy
X_test = sm.add_constant(X_test)
pprob_test = logit_full.predict(X_test)
# Create predicted category for success using 50% cutoff
psuccess_test = (pprob_test > 0.5).astype(int)
# Add new variables to the response data set
X_test['p_success'] = psuccess_test
X_test['p_prob'] = pprob_test
X_test.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 12957 entries, 779 to 18057 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DemAffl 12957 non-null float64 1 DemAge 12957 non-null float64 2 DemCluster 12957 non-null float64 3 DemClusterGroup 12957 non-null int64 4 DemGender 12957 non-null int64 5 DemReg 12957 non-null int64 6 PromClass 12957 non-null int64 7 PromSpend 12957 non-null float64 8 PromTime 12957 non-null float64 9 p_success 12957 non-null int64 10 p_prob 12957 non-null float64 11 y 12957 non-null category dtypes: category(1), float64(6), int64(5) memory usage: 1.2 MB <class 'pandas.core.frame.DataFrame'> Int64Index: 5553 entries, 13248 to 2753 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 const 5553 non-null float64 1 DemAffl 5553 non-null float64 2 DemAge 5553 non-null float64 3 DemCluster 5553 non-null float64 4 DemClusterGroup 5553 non-null int64 5 DemGender 5553 non-null int64 6 DemReg 5553 non-null int64 7 PromClass 5553 non-null int64 8 PromSpend 5553 non-null float64 9 PromTime 5553 non-null float64 10 p_success 5553 non-null int64 11 p_prob 5553 non-null float64 dtypes: float64(7), int64(5) memory usage: 564.0 KB
# Confusion matrices for the logistic model on both splits.
# Fix: sklearn's signature is confusion_matrix(y_true, y_pred); the original
# passed the predictions first, transposing the matrix (rows must be actual
# classes, columns predicted classes).
conf_matrix = confusion_matrix(y_train, psuccess)
print(conf_matrix)
# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_test, psuccess_test)
print(conf_matrix)
[[9250 1814] [ 548 1345]] [[3966 814] [ 236 537]]
# ROC curves for the logistic model, one figure per split.
def _plot_roc(codes, probs, title):
    # compute the curve and AUC, then draw it with a chance diagonal
    fpr, tpr, _ = roc_curve(codes, probs)
    auc_value = roc_auc_score(codes, probs)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (area = {auc_value:.4f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()
_plot_roc(y_train.cat.codes, pprob,
          'Receiver Operating Characteristic for Training Data')
_plot_roc(y_test.cat.codes, pprob_test,
          'Receiver Operating Characteristic for Test Data')
# Threshold (0.5) metrics for the logistic model on both splits.
acc_train = accuracy_score(y_train, psuccess)
prec_train = precision_score(y_train, psuccess)
rec_train = recall_score(y_train, psuccess)
for _label, _value in (("Accuracy (Train) :", acc_train),
                       ("Precision (Train):", prec_train),
                       ("Recall (Train) :", rec_train)):
    print("{} {:.4f}".format(_label, _value))
print("")
acc_test = accuracy_score(y_test, psuccess_test)
prec_test = precision_score(y_test, psuccess_test)
rec_test = recall_score(y_test, psuccess_test)
for _label, _value in (("Accuracy (Test) :", acc_test),
                       ("Precision (Test):", prec_test),
                       ("Recall (Test) :", rec_test)):
    print("{} {:.4f}".format(_label, _value))
print("")
Accuracy (Train) : 0.8177 Precision (Train): 0.7105 Recall (Train) : 0.4258 Accuracy (Test) : 0.8109 Precision (Test): 0.6947 Recall (Test) : 0.3975
#Reload the data
df = pd.read_csv(data_path)
# preview data
df.info()
# Removed dead line: `df.head` without parentheses was a no-op attribute
# access whose result was discarded.
print(df)
# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 22223 non-null int64
1 DemAffl 21138 non-null float64
2 DemAge 20715 non-null float64
3 DemCluster 21549 non-null float64
4 DemClusterGroup 21549 non-null object
5 DemGender 19711 non-null object
6 DemReg 21758 non-null object
7 DemTVReg 21758 non-null object
8 PromClass 22223 non-null object
9 PromSpend 22223 non-null float64
10 PromTime 21942 non-null float64
11 TargetBuy 22223 non-null int64
12 TargetAmt 22223 non-null int64
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender \
0 140 10.0 76.0 16.0 C U
1 620 4.0 49.0 35.0 D U
2 868 5.0 70.0 27.0 D F
3 1120 10.0 65.0 51.0 F M
4 2313 11.0 68.0 4.0 A F
... ... ... ... ... ... ...
22218 52834058 13.0 65.0 9.0 B F
22219 52834376 15.0 73.0 34.0 D U
22220 52837057 9.0 70.0 15.0 B F
22221 52838096 11.0 66.0 8.0 B F
22222 52856469 3.0 75.0 53.0 F NaN
DemReg DemTVReg PromClass PromSpend PromTime TargetBuy \
0 Midlands Wales & West Gold 16000.00 4.0 0
1 Midlands Wales & West Gold 6000.00 5.0 0
2 Midlands Wales & West Silver 0.02 8.0 1
3 Midlands Midlands Tin 0.01 7.0 1
4 Midlands Midlands Tin 0.01 8.0 0
... ... ... ... ... ... ...
22218 South East London Silver 1500.00 5.0 0
22219 South East S & S East Gold 6053.06 12.0 0
22220 North Yorkshire Gold 6000.00 5.0 0
22221 North N West Silver 5000.00 5.0 0
22222 South West S West Gold 6000.00 2.0 0
TargetAmt
0 0
1 0
2 1
3 1
4 0
... ...
22218 0
22219 0
22220 0
22221 0
22222 0
[22223 rows x 13 columns]
ID 0
DemAffl 1085
DemAge 1508
DemCluster 674
DemClusterGroup 674
DemGender 2512
DemReg 465
DemTVReg 465
PromClass 0
PromSpend 0
PromTime 281
TargetBuy 0
TargetAmt 0
dtype: int64
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
# Fix: assign back instead of fillna(..., inplace=True) on a column selection;
# chained in-place fills can act on a copy and are deprecated in pandas 2.x.
df['DemGender'] = df['DemGender'].fillna('U')
#Drop customers with missing data
# One pass over all five columns instead of five sequential dropna calls —
# the surviving rows are identical.
df = df.dropna(subset=['DemAge', 'DemAffl', 'DemCluster', 'DemReg', 'PromTime'])
print(df.isnull().sum())
ID 0 DemAffl 0 DemAge 0 DemCluster 0 DemClusterGroup 0 DemGender 0 DemReg 0 DemTVReg 0 PromClass 0 PromSpend 0 PromTime 0 TargetBuy 0 TargetAmt 0 dtype: int64
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
# Fix: the original rebuilt pd.get_dummies(df[col]).columns inside the lambda,
# i.e. once per ROW (O(n^2)). Compute each column's category order once.
for _col in ("DemGender", "DemClusterGroup", "DemReg", "PromClass"):
    _categories = pd.get_dummies(df[_col]).columns
    df[_col] = df[_col].apply(lambda x, cats=_categories: dummyize(x, cats))
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df.drop(['DemTVReg', 'TargetAmt', 'ID'], axis=1, inplace=True)
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
#Split the data on what the model is learning to predict, whether the customer
#brought organic products from the rest of the data
X2 = df.drop('TargetBuy', axis=1)
y2 = df['TargetBuy']
#Split the data into training and test sets to be able to train and compare models
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=77)
X2_train.info()
y2_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 12957 entries, 779 to 18057 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DemAffl 12957 non-null float64 1 DemAge 12957 non-null float64 2 DemCluster 12957 non-null float64 3 DemClusterGroup 12957 non-null int64 4 DemGender 12957 non-null int64 5 DemReg 12957 non-null int64 6 PromClass 12957 non-null int64 7 PromSpend 12957 non-null float64 8 PromTime 12957 non-null float64 dtypes: float64(5), int64(4) memory usage: 1012.3 KB <class 'pandas.core.series.Series'> Int64Index: 12957 entries, 779 to 18057 Series name: TargetBuy Non-Null Count Dtype -------------- ----- 12957 non-null category dtypes: category(1) memory usage: 114.0 KB
# L1-penalized (LASSO) logistic regressions over a small grid of C values,
# plus an L2 (ridge) baseline; liblinear supports both penalties.
lr_l1_1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
lr_l1_01 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
lr_l1_10 = LogisticRegression(penalty='l1', solver='liblinear', C=1)
lr_l1_7 = LogisticRegression(penalty='l1', solver='liblinear', C=0.7)
lr_l2 = LogisticRegression(penalty='l2', solver='liblinear')
# fit each model to the training data (same order as before)
for _model in (lr_l1_1, lr_l1_01, lr_l1_10, lr_l1_7, lr_l2):
    _model.fit(X2_train, y2_train)
LogisticRegression(solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(solver='liblinear')
# function for model coefficients
def rpt_model_variables(model, feature_names=None):
    """Print and return a fitted linear model's coefficients as a DataFrame.

    Parameters
    ----------
    model : fitted estimator exposing ``intercept_`` and ``coef_``.
    feature_names : optional sequence of column names; defaults to the
        notebook-level ``X2.columns`` for backward compatibility.

    Returns
    -------
    DataFrame with 'feature', 'coefficient' (rounded to 4 decimals) and
    'abs_coefficient', sorted by absolute magnitude, largest first.
    """
    if feature_names is None:
        # original behavior: read the global feature frame
        feature_names = X2.columns
    # Get the intercept term (kept for parity; not reported below)
    intercept = model.intercept_
    # Access the coefficients (weights) of the model, rounded
    coefficients = np.round(model.coef_[0], decimals=4)
    df_coefficients = pd.DataFrame({'feature': feature_names,
                                    'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)
    print(df_coefficients)
    return df_coefficients
# Report the coefficients of every regularized model; the per-model result
# frames keep their original names for the plotting cell below.
_reports = []
for _label, _model in (("Lasso C=0.1", lr_l1_1),
                       ("Lasso C=0.01", lr_l1_01),
                       ("Lasso C=1", lr_l1_10),
                       ("Lasso C=0.7", lr_l1_7),
                       ("Ridge Regression", lr_l2)):
    print(_label)
    _reports.append(rpt_model_variables(_model))
    if _model is not lr_l2:
        # blank separator after every report except the last
        print("")
df_coefficients1, df_coefficients01, df_coefficients10, df_coefficients7, df_coefficients2 = _reports
Lasso C=0.1
feature coefficient abs_coefficient
4 DemGender -0.9322 0.9322
0 DemAffl 0.2572 0.2572
3 DemClusterGroup 0.1210 0.1210
1 DemAge -0.0560 0.0560
6 PromClass -0.0468 0.0468
2 DemCluster -0.0105 0.0105
5 DemReg -0.0057 0.0057
8 PromTime 0.0047 0.0047
7 PromSpend -0.0000 0.0000
Lasso C=0.01
feature coefficient abs_coefficient
4 DemGender -0.8159 0.8159
0 DemAffl 0.2408 0.2408
1 DemAge -0.0567 0.0567
6 PromClass -0.0113 0.0113
2 DemCluster 0.0001 0.0001
3 DemClusterGroup 0.0000 0.0000
5 DemReg 0.0000 0.0000
7 PromSpend 0.0000 0.0000
8 PromTime 0.0000 0.0000
Lasso C=1
feature coefficient abs_coefficient
4 DemGender -0.9420 0.9420
0 DemAffl 0.2619 0.2619
1 DemAge -0.0542 0.0542
6 PromClass -0.0394 0.0394
3 DemClusterGroup -0.0067 0.0067
8 PromTime 0.0056 0.0056
2 DemCluster 0.0032 0.0032
5 DemReg -0.0026 0.0026
7 PromSpend -0.0000 0.0000
Lasso C=0.7
feature coefficient abs_coefficient
4 DemGender -0.9398 0.9398
0 DemAffl 0.2632 0.2632
1 DemAge -0.0534 0.0534
6 PromClass -0.0332 0.0332
8 PromTime 0.0052 0.0052
3 DemClusterGroup 0.0050 0.0050
2 DemCluster 0.0023 0.0023
5 DemReg -0.0005 0.0005
7 PromSpend -0.0000 0.0000
Ridge Regression
feature coefficient abs_coefficient
4 DemGender -0.8877 0.8877
0 DemAffl 0.2501 0.2501
6 PromClass -0.0607 0.0607
1 DemAge -0.0571 0.0571
5 DemReg 0.0192 0.0192
3 DemClusterGroup 0.0151 0.0151
8 PromTime 0.0045 0.0045
2 DemCluster -0.0001 0.0001
7 PromSpend -0.0000 0.0000
# plot variable importance
# Bar chart of the non-zero absolute coefficients, followed by a printed
# list of the features the model zeroed out.
def plot_variable_imp(df_coef):
    """Bar-plot |coefficients| the model uses and list the rejected features."""
    kept = df_coef[df_coef['abs_coefficient'] != 0]
    rejected = df_coef.loc[df_coef['abs_coefficient'] == 0, 'feature'].tolist()
    plt.figure(figsize=(5, 10))
    plt.title('Variable Importance')
    plt.xlabel('Coefficient')
    plt.ylabel('Feature')
    sns.barplot(data=kept,
                y=kept['feature'],
                x=kept['abs_coefficient'], color="lightblue")
    plt.show()
    print("-- rejected --")
    for name in rejected:
        print(f" {name}")
# plot the variable importance for each regularized model
for _label, _coefs in (("Lasso C=0.1", df_coefficients1),
                       ("Lasso C=0.01", df_coefficients01),
                       ("Lasso C=1", df_coefficients10),
                       ("Lasso C=0.7", df_coefficients7),
                       ("Ridge Regression", df_coefficients2)):
    print(_label)
    plot_variable_imp(_coefs)
    if _label != "Ridge Regression":
        # blank separator after every plot except the last
        print("")
Lasso C=0.1
-- rejected -- PromSpend Lasso C=0.01
-- rejected -- DemClusterGroup DemReg PromSpend PromTime Lasso C=1
-- rejected -- PromSpend Lasso C=0.7
-- rejected -- PromSpend Ridge Regression
-- rejected -- PromSpend
# Score every regularized model on both splits; the per-model variable names
# are kept exactly as the evaluation cells below expect them.
def _score(model):
    # hard labels and class probabilities for the train and test splits
    return (model.predict(X2_train), model.predict(X2_test),
            model.predict_proba(X2_train), model.predict_proba(X2_test))
#Lasso C=0.1
y_pred_train, y_pred_test, y_proba_train, y_proba_test = _score(lr_l1_1)
#Lasso C=0.01
y_pred_train1, y_pred_test1, y_proba_train1, y_proba_test1 = _score(lr_l1_01)
#Lasso C=1
y_pred_train10, y_pred_test10, y_proba_train10, y_proba_test10 = _score(lr_l1_10)
#Lasso C=0.7
y_pred_train7, y_pred_test7, y_proba_train7, y_proba_test7 = _score(lr_l1_7)
#Ridge Regression
y_pred_train2, y_pred_test2, y_proba_train2, y_proba_test2 = _score(lr_l2)
L1 with C=0.1
# Threshold metrics plus AUC for the C=0.1 lasso model, both splits.
acc2_train = accuracy_score(y2_train, y_pred_train)
prec2_train = precision_score(y2_train, y_pred_train)
rec2_train = recall_score(y2_train, y_pred_train)
auc2_train = roc_auc_score(y2_train, y_proba_train[:, 1])
print(" -- train set -- ")
for _label, _value in (("Accuracy :", acc2_train), ("Precision:", prec2_train),
                       ("Recall. :", rec2_train), ("AUC :", auc2_train)):
    print("{} {:.4f}".format(_label, _value))
print("")
acc2_test = accuracy_score(y2_test, y_pred_test)
prec2_test = precision_score(y2_test, y_pred_test)
rec2_test = recall_score(y2_test, y_pred_test)
auc2_test = roc_auc_score(y2_test, y_proba_test[:, 1])
print(" -- test set -- ")
for _label, _value in (("Accuracy :", acc2_test), ("Precision:", prec2_test),
                       ("Recall. :", rec2_test), ("AUC :", auc2_test)):
    print("{} {:.4f}".format(_label, _value))
-- train set -- Accuracy : 0.8180 Precision: 0.7116 Recall. : 0.4264 AUC : 0.8166 -- test set -- Accuracy : 0.8115 Precision: 0.6939 Recall. : 0.4027 AUC : 0.8008
L1 with C=0.01
# Threshold metrics plus AUC for the C=0.01 lasso model, both splits.
acc2_train1 = accuracy_score(y2_train, y_pred_train1)
prec2_train1 = precision_score(y2_train, y_pred_train1)
rec2_train1 = recall_score(y2_train, y_pred_train1)
auc2_train1 = roc_auc_score(y2_train, y_proba_train1[:, 1])
print(" -- train set -- ")
for _label, _value in (("Accuracy :", acc2_train1), ("Precision:", prec2_train1),
                       ("Recall. :", rec2_train1), ("AUC :", auc2_train1)):
    print("{} {:.4f}".format(_label, _value))
print("")
acc2_test1 = accuracy_score(y2_test, y_pred_test1)
prec2_test1 = precision_score(y2_test, y_pred_test1)
rec2_test1 = recall_score(y2_test, y_pred_test1)
auc2_test1 = roc_auc_score(y2_test, y_proba_test1[:, 1])
print(" -- test set -- ")
for _label, _value in (("Accuracy :", acc2_test1), ("Precision:", prec2_test1),
                       ("Recall. :", rec2_test1), ("AUC :", auc2_test1)):
    print("{} {:.4f}".format(_label, _value))
-- train set -- Accuracy : 0.8186 Precision: 0.7224 Recall. : 0.4160 AUC : 0.8153 -- test set -- Accuracy : 0.8118 Precision: 0.7051 Recall. : 0.3893 AUC : 0.8005
L1 with C=1
# Threshold metrics plus AUC for the C=1 lasso model, both splits.
acc2_train10 = accuracy_score(y2_train, y_pred_train10)
prec2_train10 = precision_score(y2_train, y_pred_train10)
rec2_train10 = recall_score(y2_train, y_pred_train10)
auc2_train10 = roc_auc_score(y2_train, y_proba_train10[:, 1])
print(" -- train set -- ")
for _label, _value in (("Accuracy :", acc2_train10), ("Precision:", prec2_train10),
                       ("Recall. :", rec2_train10), ("AUC :", auc2_train10)):
    print("{} {:.4f}".format(_label, _value))
print("")
acc2_test10 = accuracy_score(y2_test, y_pred_test10)
prec2_test10 = precision_score(y2_test, y_pred_test10)
rec2_test10 = recall_score(y2_test, y_pred_test10)
auc2_test10 = roc_auc_score(y2_test, y_proba_test10[:, 1])
print(" -- test set -- ")
for _label, _value in (("Accuracy :", acc2_test10), ("Precision:", prec2_test10),
                       ("Recall. :", rec2_test10), ("AUC :", auc2_test10)):
    print("{} {:.4f}".format(_label, _value))
-- train set -- Accuracy : 0.8180 Precision: 0.7118 Recall. : 0.4261 AUC : 0.8167 -- test set -- Accuracy : 0.8098 Precision: 0.6913 Recall. : 0.3945 AUC : 0.8009
L1 with C=0.7
# Threshold metrics plus AUC for the C=0.7 lasso model, both splits.
acc2_train7 = accuracy_score(y2_train, y_pred_train7)
prec2_train7 = precision_score(y2_train, y_pred_train7)
rec2_train7 = recall_score(y2_train, y_pred_train7)
auc2_train7 = roc_auc_score(y2_train, y_proba_train7[:, 1])
print(" -- train set -- ")
for _label, _value in (("Accuracy :", acc2_train7), ("Precision:", prec2_train7),
                       ("Recall. :", rec2_train7), ("AUC :", auc2_train7)):
    print("{} {:.4f}".format(_label, _value))
print("")
acc2_test7 = accuracy_score(y2_test, y_pred_test7)
prec2_test7 = precision_score(y2_test, y_pred_test7)
rec2_test7 = recall_score(y2_test, y_pred_test7)
auc2_test7 = roc_auc_score(y2_test, y_proba_test7[:, 1])
print(" -- test set -- ")
for _label, _value in (("Accuracy :", acc2_test7), ("Precision:", prec2_test7),
                       ("Recall. :", rec2_test7), ("AUC :", auc2_test7)):
    print("{} {:.4f}".format(_label, _value))
-- train set -- Accuracy : 0.8180 Precision: 0.7127 Recall. : 0.4248 AUC : 0.8167 -- test set -- Accuracy : 0.8100 Precision: 0.6922 Recall. : 0.3945 AUC : 0.8009
L2 regularization
# Threshold metrics plus AUC for the ridge (L2) model, both splits.
acc2_train2 = accuracy_score(y2_train, y_pred_train2)
prec2_train2 = precision_score(y2_train, y_pred_train2)
rec2_train2 = recall_score(y2_train, y_pred_train2)
auc2_train2 = roc_auc_score(y2_train, y_proba_train2[:, 1])
print(" -- train set -- ")
for _label, _value in (("Accuracy :", acc2_train2), ("Precision:", prec2_train2),
                       ("Recall. :", rec2_train2), ("AUC :", auc2_train2)):
    print("{} {:.4f}".format(_label, _value))
print("")
acc2_test2 = accuracy_score(y2_test, y_pred_test2)
prec2_test2 = precision_score(y2_test, y_pred_test2)
rec2_test2 = recall_score(y2_test, y_pred_test2)
auc2_test2 = roc_auc_score(y2_test, y_proba_test2[:, 1])
print(" -- test set -- ")
for _label, _value in (("Accuracy :", acc2_test2), ("Precision:", prec2_test2),
                       ("Recall. :", rec2_test2), ("AUC :", auc2_test2)):
    print("{} {:.4f}".format(_label, _value))
-- train set -- Accuracy : 0.8185 Precision: 0.7171 Recall. : 0.4220 AUC : 0.8162 -- test set -- Accuracy : 0.8124 Precision: 0.6999 Recall. : 0.4004 AUC : 0.8007
#Reload the data to include data from customers who were dropped for missing
#data in categories that were not used in the original L1 c=0.01 model
df = pd.read_csv(data_path)
# preview data
df.info()
# was bare `df.head` — a bound-method reference that does nothing; call it
df.head()
print(df)
# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 22223 non-null int64
1 DemAffl 21138 non-null float64
2 DemAge 20715 non-null float64
3 DemCluster 21549 non-null float64
4 DemClusterGroup 21549 non-null object
5 DemGender 19711 non-null object
6 DemReg 21758 non-null object
7 DemTVReg 21758 non-null object
8 PromClass 22223 non-null object
9 PromSpend 22223 non-null float64
10 PromTime 21942 non-null float64
11 TargetBuy 22223 non-null int64
12 TargetAmt 22223 non-null int64
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender \
0 140 10.0 76.0 16.0 C U
1 620 4.0 49.0 35.0 D U
2 868 5.0 70.0 27.0 D F
3 1120 10.0 65.0 51.0 F M
4 2313 11.0 68.0 4.0 A F
... ... ... ... ... ... ...
22218 52834058 13.0 65.0 9.0 B F
22219 52834376 15.0 73.0 34.0 D U
22220 52837057 9.0 70.0 15.0 B F
22221 52838096 11.0 66.0 8.0 B F
22222 52856469 3.0 75.0 53.0 F NaN
DemReg DemTVReg PromClass PromSpend PromTime TargetBuy \
0 Midlands Wales & West Gold 16000.00 4.0 0
1 Midlands Wales & West Gold 6000.00 5.0 0
2 Midlands Wales & West Silver 0.02 8.0 1
3 Midlands Midlands Tin 0.01 7.0 1
4 Midlands Midlands Tin 0.01 8.0 0
... ... ... ... ... ... ...
22218 South East London Silver 1500.00 5.0 0
22219 South East S & S East Gold 6053.06 12.0 0
22220 North Yorkshire Gold 6000.00 5.0 0
22221 North N West Silver 5000.00 5.0 0
22222 South West S West Gold 6000.00 2.0 0
TargetAmt
0 0
1 0
2 1
3 1
4 0
... ...
22218 0
22219 0
22220 0
22221 0
22222 0
[22223 rows x 13 columns]
ID 0
DemAffl 1085
DemAge 1508
DemCluster 674
DemClusterGroup 674
DemGender 2512
DemReg 465
DemTVReg 465
PromClass 0
PromSpend 0
PromTime 281
TargetBuy 0
TargetAmt 0
dtype: int64
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
# NOTE: df['DemGender'].fillna('U', inplace=True) is chained-assignment style;
# it is deprecated and unreliable under pandas copy-on-write, so assign back.
df['DemGender'] = df['DemGender'].fillna('U')
#Drop customers with data missing in columns that will be used in the model
df.dropna(subset=['DemAffl', 'DemAge', 'DemCluster'], inplace=True)
print(df.isnull().sum()) #Show where there are still missing values
ID 0 DemAffl 0 DemAge 0 DemCluster 0 DemClusterGroup 0 DemGender 0 DemReg 399 DemTVReg 399 PromClass 0 PromSpend 0 PromTime 225 TargetBuy 0 TargetAmt 0 dtype: int64
#Drop the columns that were not used in the previous L1 C=0.01 model
#so we can use the data from customers with values missing in those columns
#
#Drop columns that would be detrimental to machine learning
#ID is the unique identifier for each customer
#TargetAmt gives away if a customer bought organic products (it counts the
#organic purchases, so it leaks the target)
#DemTVReg has too many categories
df.drop(['DemReg', 'DemTVReg', 'PromTime', 'ID', 'TargetAmt'], axis=1, inplace=True)
print(df.isnull().sum())
DemAffl 0 DemAge 0 DemCluster 0 DemClusterGroup 0 DemGender 0 PromClass 0 PromSpend 0 TargetBuy 0 dtype: int64
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
# Compute each column's category list ONCE instead of inside the per-row
# lambda: the original rebuilt pd.get_dummies(...) for every row, making the
# conversion O(n^2) over the ~21k rows for no behavioral difference.
gender_cats = pd.get_dummies(df['DemGender']).columns
cluster_cats = pd.get_dummies(df['DemClusterGroup']).columns
promclass_cats = pd.get_dummies(df['PromClass']).columns
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, gender_cats))
df["DemClusterGroup"] = df["DemClusterGroup"].apply(lambda x: dummyize(x, cluster_cats))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, promclass_cats))
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
#Split the data on what the model is learning to predict, whether the customer
#bought organic products, from the rest of the data
X2 = df.drop('TargetBuy', axis=1)
y2 = df['TargetBuy']
#Split the data into training and test sets to be able to train and compare models
# (same test_size/random_state as the earlier splits so results stay comparable)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=77)
X2_train.info()
y2_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 13390 entries, 9855 to 17474 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DemAffl 13390 non-null float64 1 DemAge 13390 non-null float64 2 DemCluster 13390 non-null float64 3 DemClusterGroup 13390 non-null int64 4 DemGender 13390 non-null int64 5 PromClass 13390 non-null int64 6 PromSpend 13390 non-null float64 dtypes: float64(4), int64(3) memory usage: 836.9 KB <class 'pandas.core.series.Series'> Int64Index: 13390 entries, 9855 to 17474 Series name: TargetBuy Non-Null Count Dtype -------------- ----- 13390 non-null category dtypes: category(1) memory usage: 117.8 KB
#Create the L1 C=0.01 model with the new data
# liblinear supports the l1 penalty; a small C means strong regularization,
# which zeroes out unimportant coefficients (implicit feature selection)
lr_l1_01_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
lr_l1_01_2.fit(X2_train, y2_train)
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
#Evaluate the coefficients
# rpt_model_variables is defined earlier in the notebook; per the output below
# it returns a DataFrame of feature / coefficient / abs_coefficient
df_coefficients01_2 = rpt_model_variables(lr_l1_01_2)
feature coefficient abs_coefficient 4 DemGender -0.7951 0.7951 0 DemAffl 0.2360 0.2360 1 DemAge -0.0560 0.0560 5 PromClass -0.0018 0.0018 2 DemCluster -0.0005 0.0005 3 DemClusterGroup 0.0000 0.0000 6 PromSpend 0.0000 0.0000
# plot variable importance
#Same function as plot_variable_imp except it prints a larger bar chart
# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp_larger(df_coef):
    """Bar-chart the non-zero |coefficient| of each feature in a large figure,
    then print the features the model rejected (zero coefficient).

    df_coef: DataFrame with 'feature', 'coefficient', 'abs_coefficient'
             columns, as produced by rpt_model_variables.
    """
    # determine the variables the model is using and create df
    # of their absolute coefficients
    df_plt = df_coef[df_coef['abs_coefficient'] != 0]
    # determine the variables the model is not using
    reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()
    # bar graph of the absolute coefficients that the model is using
    plt.figure(figsize=(40, 20)) #Create larger chart
    # Draw the bars FIRST, then set the title/labels: seaborn writes its own
    # axis labels, so the original order (labels before barplot) had them
    # overwritten and the fontsize settings were lost. Also pass column names
    # rather than Series, the conventional form when data= is supplied.
    sns.barplot(data=df_plt, y='feature', x='abs_coefficient',
                color="lightblue")
    plt.title('Variable Importance', fontsize=18)
    plt.xlabel('Coefficient', fontsize=18)
    plt.ylabel('Feature', fontsize=18)
    plt.tick_params(axis='x', which='major', labelsize=18)
    plt.tick_params(axis='y', which='major', labelsize=25)
    plt.show()
    # print the variables the model is not using after the bar graph
    print("-- rejected --")
    for i in reject_vars:
        print(f" {i}")
# plot the variable importance for this model
# (shows the chart, then prints the L1-rejected features below it)
plot_variable_imp_larger(df_coefficients01_2)
-- rejected -- DemClusterGroup PromSpend
#Make predictions on the training and test data to evaluate the model
y_pred_train01_2 = lr_l1_01_2.predict(X2_train)
y_pred_test01_2 = lr_l1_01_2.predict(X2_test)
# predict_proba returns [P(class 0), P(class 1)] per row; column 1 feeds AUC
y_proba_train01_2 = lr_l1_01_2.predict_proba(X2_train)
y_proba_test01_2 = lr_l1_01_2.predict_proba(X2_test)
# calculate the accuracy, precision, and recall scores for the training set
acc2_train01_2 = accuracy_score(y2_train, y_pred_train01_2)
prec2_train01_2 = precision_score(y2_train, y_pred_train01_2)
rec2_train01_2 = recall_score(y2_train, y_pred_train01_2)
auc2_train01_2 = roc_auc_score(y2_train, y_proba_train01_2[:,1])
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train01_2))
print("Precision: {:.4f}".format(prec2_train01_2))
print("Recall. : {:.4f}".format(rec2_train01_2))
print("AUC : {:.4f}".format(auc2_train01_2))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc2_test01_2 = accuracy_score(y2_test, y_pred_test01_2)
prec2_test01_2 = precision_score(y2_test, y_pred_test01_2)
rec2_test01_2 = recall_score(y2_test, y_pred_test01_2)
auc2_test01_2 = roc_auc_score(y2_test, y_proba_test01_2[:,1])
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test01_2))
print("Precision: {:.4f}".format(prec2_test01_2))
print("Recall. : {:.4f}".format(rec2_test01_2))
print("AUC : {:.4f}".format(auc2_test01_2))
-- train set -- Accuracy : 0.8184 Precision: 0.7268 Recall. : 0.4059 AUC : 0.8094 -- test set -- Accuracy : 0.8116 Precision: 0.7127 Recall. : 0.4000 AUC : 0.8094
#Reload the data to include data from customers who were dropped for missing
#data in categories that were not used in the original L1 c=0.01 model and
#for missing data in DemClusterGroup or neighborhood group since it had
#negligible importance in the model
df = pd.read_csv(data_path)
# preview data
df.info()
# was bare `df.head` — a bound-method reference that does nothing; call it
df.head()
print(df)
# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 22223 non-null int64
1 DemAffl 21138 non-null float64
2 DemAge 20715 non-null float64
3 DemCluster 21549 non-null float64
4 DemClusterGroup 21549 non-null object
5 DemGender 19711 non-null object
6 DemReg 21758 non-null object
7 DemTVReg 21758 non-null object
8 PromClass 22223 non-null object
9 PromSpend 22223 non-null float64
10 PromTime 21942 non-null float64
11 TargetBuy 22223 non-null int64
12 TargetAmt 22223 non-null int64
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender \
0 140 10.0 76.0 16.0 C U
1 620 4.0 49.0 35.0 D U
2 868 5.0 70.0 27.0 D F
3 1120 10.0 65.0 51.0 F M
4 2313 11.0 68.0 4.0 A F
... ... ... ... ... ... ...
22218 52834058 13.0 65.0 9.0 B F
22219 52834376 15.0 73.0 34.0 D U
22220 52837057 9.0 70.0 15.0 B F
22221 52838096 11.0 66.0 8.0 B F
22222 52856469 3.0 75.0 53.0 F NaN
DemReg DemTVReg PromClass PromSpend PromTime TargetBuy \
0 Midlands Wales & West Gold 16000.00 4.0 0
1 Midlands Wales & West Gold 6000.00 5.0 0
2 Midlands Wales & West Silver 0.02 8.0 1
3 Midlands Midlands Tin 0.01 7.0 1
4 Midlands Midlands Tin 0.01 8.0 0
... ... ... ... ... ... ...
22218 South East London Silver 1500.00 5.0 0
22219 South East S & S East Gold 6053.06 12.0 0
22220 North Yorkshire Gold 6000.00 5.0 0
22221 North N West Silver 5000.00 5.0 0
22222 South West S West Gold 6000.00 2.0 0
TargetAmt
0 0
1 0
2 1
3 1
4 0
... ...
22218 0
22219 0
22220 0
22221 0
22222 0
[22223 rows x 13 columns]
ID 0
DemAffl 1085
DemAge 1508
DemCluster 674
DemClusterGroup 674
DemGender 2512
DemReg 465
DemTVReg 465
PromClass 0
PromSpend 0
PromTime 281
TargetBuy 0
TargetAmt 0
dtype: int64
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
# NOTE: df['DemGender'].fillna('U', inplace=True) is chained-assignment style;
# it is deprecated and unreliable under pandas copy-on-write, so assign back.
df['DemGender'] = df['DemGender'].fillna('U')
#Drop customers with data missing in columns that will be used in the model
df.dropna(subset=['DemAffl', 'DemAge'], inplace=True)
print(df.isnull().sum()) #Show where there are still missing values
ID 0 DemAffl 0 DemAge 0 DemCluster 585 DemClusterGroup 585 DemGender 0 DemReg 411 DemTVReg 411 PromClass 0 PromSpend 0 PromTime 243 TargetBuy 0 TargetAmt 0 dtype: int64
#Drop the columns that were not used in the original L1 C=0.01 model and
#DemClusterGroup or neighborhood group since it had negligible importance in
#both the original and previous L1 C=0.01 models so we can use the data from
#customers with values missing in those columns
#
#Drop columns that would be detrimental to machine learning
#ID is the unique identifier for each customer
#TargetAmt gives away if a customer bought organic products (it counts the
#organic purchases, so it leaks the target)
#DemTVReg has too many categories
df.drop(['DemReg', 'DemTVReg', 'PromTime', 'ID', 'TargetAmt', 'DemCluster', 'DemClusterGroup'], axis=1, inplace=True)
print(df.isnull().sum())
DemAffl 0 DemAge 0 DemGender 0 PromClass 0 PromSpend 0 TargetBuy 0 dtype: int64
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
# Compute each column's category list ONCE instead of inside the per-row
# lambda: the original rebuilt pd.get_dummies(...) for every row (O(n^2)).
gender_cats = pd.get_dummies(df['DemGender']).columns
promclass_cats = pd.get_dummies(df['PromClass']).columns
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, gender_cats))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, promclass_cats))
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
#Split the data on what the model is learning to predict, whether the customer
#bought organic products, from the rest of the data
X2 = df.drop('TargetBuy', axis=1)
y2 = df['TargetBuy']
#Split the data into training and test sets to be able to train and compare models
# (same test_size/random_state as the earlier splits so results stay comparable)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=77)
X2_train.info()
y2_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 13799 entries, 9631 to 16966 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DemAffl 13799 non-null float64 1 DemAge 13799 non-null float64 2 DemGender 13799 non-null int64 3 PromClass 13799 non-null int64 4 PromSpend 13799 non-null float64 dtypes: float64(3), int64(2) memory usage: 646.8 KB <class 'pandas.core.series.Series'> Int64Index: 13799 entries, 9631 to 16966 Series name: TargetBuy Non-Null Count Dtype -------------- ----- 13799 non-null category dtypes: category(1) memory usage: 121.4 KB
#Create the L1 C=0.01 model with the new data
# liblinear supports the l1 penalty; small C = strong regularization
lr_l1_01_3 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
lr_l1_01_3.fit(X2_train, y2_train)
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
#Evaluate the coefficients
# rpt_model_variables returns a feature/coefficient/abs_coefficient DataFrame
df_coefficients01_3 = rpt_model_variables(lr_l1_01_3)
feature coefficient abs_coefficient 2 DemGender -0.7904 0.7904 0 DemAffl 0.2337 0.2337 1 DemAge -0.0547 0.0547 3 PromClass 0.0000 0.0000 4 PromSpend -0.0000 0.0000
# plot the variable importance for this model
# (plot_variable_imp is the smaller-figure variant defined earlier in the notebook)
plot_variable_imp(df_coefficients01_3)
-- rejected -- PromClass PromSpend
#Make predictions on the training and test data to evaluate the model
y_pred_train01_3 = lr_l1_01_3.predict(X2_train)
y_pred_test01_3 = lr_l1_01_3.predict(X2_test)
# predict_proba returns [P(class 0), P(class 1)] per row; column 1 feeds AUC
y_proba_train01_3 = lr_l1_01_3.predict_proba(X2_train)
y_proba_test01_3 = lr_l1_01_3.predict_proba(X2_test)
# calculate the accuracy, precision, and recall scores for the training set
acc2_train01_3 = accuracy_score(y2_train, y_pred_train01_3)
prec2_train01_3 = precision_score(y2_train, y_pred_train01_3)
rec2_train01_3 = recall_score(y2_train, y_pred_train01_3)
auc2_train01_3 = roc_auc_score(y2_train, y_proba_train01_3[:,1])
# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train01_3))
print("Precision: {:.4f}".format(prec2_train01_3))
print("Recall. : {:.4f}".format(rec2_train01_3))
print("AUC : {:.4f}".format(auc2_train01_3))
print("")
# calculate the accuracy, precision, and recall scores for the testing set
acc2_test01_3 = accuracy_score(y2_test, y_pred_test01_3)
prec2_test01_3 = precision_score(y2_test, y_pred_test01_3)
rec2_test01_3 = recall_score(y2_test, y_pred_test01_3)
auc2_test01_3 = roc_auc_score(y2_test, y_proba_test01_3[:,1])
# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test01_3))
print("Precision: {:.4f}".format(prec2_test01_3))
print("Recall. : {:.4f}".format(rec2_test01_3))
print("AUC : {:.4f}".format(auc2_test01_3))
-- train set -- Accuracy : 0.8139 Precision: 0.7240 Recall. : 0.4118 AUC : 0.8102 -- test set -- Accuracy : 0.8220 Precision: 0.7077 Recall. : 0.4168 AUC : 0.8086
# load the data into a pandas dataframe
data_path = '/content/drive/My Drive/New_organics.csv'
dfnew = pd.read_csv(data_path)
# preview data
dfnew.info()
# was bare `dfnew.head` — a bound-method reference that does nothing; call it
dfnew.head()
print(dfnew)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 150 non-null int64
1 DemAffl 150 non-null int64
2 DemAge 150 non-null int64
3 DemCluster 150 non-null int64
4 DemClusterGroup 150 non-null object
5 DemGender 150 non-null object
6 DemReg 150 non-null object
7 DemTVReg 150 non-null object
8 PromClass 150 non-null object
9 PromSpend 150 non-null float64
10 PromTime 150 non-null int64
dtypes: float64(1), int64(5), object(5)
memory usage: 13.0+ KB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender DemReg \
0 1 6 46 27 D F South East
1 2 10 39 12 B M Midlands
2 3 7 75 12 B M Midlands
3 4 7 72 12 B F Midlands
4 5 7 43 53 F M North
.. ... ... ... ... ... ... ...
145 146 3 59 12 B F Midlands
146 147 8 69 16 C M South East
147 158 7 27 52 F F South East
148 149 10 63 14 B M South East
149 150 18 41 35 D F North
DemTVReg PromClass PromSpend PromTime
0 London Tin 0.01 8
1 Ulster Tin 0.01 5
2 East Silver 4000.00 7
3 Ulster Silver 1250.00 23
4 N West Tin 0.01 5
.. ... ... ... ...
145 East Gold 12000.00 4
146 London Gold 14400.00 8
147 London Tin 0.01 4
148 London Silver 3000.00 7
149 N East Tin 0.01 5
[150 rows x 11 columns]
#Drop columns that were not used in the modified tuned decision tree so it can
#score this new data (the remaining columns must match the model's training
#features exactly)
dfnew.drop(['DemCluster', 'DemClusterGroup', 'DemReg', 'DemTVReg', 'PromTime', 'ID'], axis=1, inplace=True)
print(dfnew.isnull().sum())
DemAffl 0 DemAge 0 DemGender 0 PromClass 0 PromSpend 0 dtype: int64
#Make the categorical columns into dummies using the dummyize function so they
#can be scored by the modified tuned decision tree
# Compute the category lists ONCE instead of inside the per-row lambda
# (the original rebuilt pd.get_dummies(...) for every row).
# NOTE(review): the codes come from dfnew's OWN category set; if a category
# seen in training is absent here the integer encoding could shift — confirm
# dummyize maps categories consistently with the training data.
gender_cats_new = pd.get_dummies(dfnew['DemGender']).columns
promclass_cats_new = pd.get_dummies(dfnew['PromClass']).columns
dfnew["DemGender"] = dfnew["DemGender"].apply(lambda x: dummyize(x, gender_cats_new))
dfnew["PromClass"] = dfnew["PromClass"].apply(lambda x: dummyize(x, promclass_cats_new))
# make predictions on the new data
Scored_class = dt_tuned_mod_more_data.predict(dfnew)
Scored_probs = dt_tuned_mod_more_data.predict_proba(dfnew)
# Attach the predicted class and buy probability (positive-class column) to the dataframe
dfnew['pred_Buy'] = Scored_class
# NOTE(review): 'prod_prob_buy' looks like a typo for 'pred_prob_buy' — kept
# as-is because the scored output file below uses this column name
dfnew['prod_prob_buy']=Scored_probs[:,1]
print(dfnew) # Print the new dataframe
DemAffl DemAge DemGender PromClass PromSpend pred_Buy prod_prob_buy 0 6 46 0 3 0.01 0 0.144857 1 10 39 1 3 0.01 0 0.377709 2 7 75 1 2 4000.00 0 0.052396 3 7 72 0 2 1250.00 0 0.144857 4 7 43 1 3 0.01 0 0.189076 .. ... ... ... ... ... ... ... 145 3 59 0 0 12000.00 0 0.069240 146 8 69 1 0 14400.00 0 0.052396 147 7 27 0 3 0.01 1 0.528881 148 10 63 1 2 3000.00 0 0.127919 149 18 41 0 3 0.01 1 0.809524 [150 rows x 7 columns]
# Make the new dataframe with the predictions and probabilities into an Excel file
# (to_excel writes the DataFrame index as the first column by default)
scored_file = 'NewOrganicCustomerScores.xlsx'
dfnew.to_excel(scored_file)
#Reload the data into a dataframe called df_analysis to do analysis with the
#original data without any changes made for machine learning
data_path = '/content/drive/MyDrive/organics.csv'
df_analysis = pd.read_csv(data_path)
# preview data
df_analysis.info()
# was bare `df_analysis.head` — a no-op bound-method reference; call it
df_analysis.head()
print(df_analysis)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 22223 non-null int64
1 DemAffl 21138 non-null float64
2 DemAge 20715 non-null float64
3 DemCluster 21549 non-null float64
4 DemClusterGroup 21549 non-null object
5 DemGender 19711 non-null object
6 DemReg 21758 non-null object
7 DemTVReg 21758 non-null object
8 PromClass 22223 non-null object
9 PromSpend 22223 non-null float64
10 PromTime 21942 non-null float64
11 TargetBuy 22223 non-null int64
12 TargetAmt 22223 non-null int64
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
ID DemAffl DemAge DemCluster DemClusterGroup DemGender \
0 140 10.0 76.0 16.0 C U
1 620 4.0 49.0 35.0 D U
2 868 5.0 70.0 27.0 D F
3 1120 10.0 65.0 51.0 F M
4 2313 11.0 68.0 4.0 A F
... ... ... ... ... ... ...
22218 52834058 13.0 65.0 9.0 B F
22219 52834376 15.0 73.0 34.0 D U
22220 52837057 9.0 70.0 15.0 B F
22221 52838096 11.0 66.0 8.0 B F
22222 52856469 3.0 75.0 53.0 F NaN
DemReg DemTVReg PromClass PromSpend PromTime TargetBuy \
0 Midlands Wales & West Gold 16000.00 4.0 0
1 Midlands Wales & West Gold 6000.00 5.0 0
2 Midlands Wales & West Silver 0.02 8.0 1
3 Midlands Midlands Tin 0.01 7.0 1
4 Midlands Midlands Tin 0.01 8.0 0
... ... ... ... ... ... ...
22218 South East London Silver 1500.00 5.0 0
22219 South East S & S East Gold 6053.06 12.0 0
22220 North Yorkshire Gold 6000.00 5.0 0
22221 North N West Silver 5000.00 5.0 0
22222 South West S West Gold 6000.00 2.0 0
TargetAmt
0 0
1 0
2 1
3 1
4 0
... ...
22218 0
22219 0
22220 0
22221 0
22222 0
[22223 rows x 13 columns]
#Get the average amount spent, time spent, age, and affluence for each loyalty status
# (mean() skips NaN values by default, so the missing ages/affluence are ignored)
df_analysis.groupby('PromClass')[['PromSpend', 'PromTime', 'DemAge', 'DemAffl']].mean()
| PromSpend | PromTime | DemAge | DemAffl | |
|---|---|---|---|---|
| PromClass | ||||
| Gold | 8795.965194 | 6.418389 | 59.553901 | 8.518170 |
| Platinum | 30399.233869 | 6.974668 | 62.288973 | 8.550186 |
| Silver | 1992.261697 | 6.770860 | 53.652875 | 8.716769 |
| Tin | 0.010000 | 6.379727 | 47.222038 | 8.916815 |
#Get the average amount spent, time spent, and affluence for each loyalty status
#for customers 40 years old and younger (age is the filter here, so DemAge
#itself is not averaged)
df_analysis[df_analysis['DemAge'] <= 40].groupby('PromClass')[['PromSpend', 'PromTime', 'DemAffl']].mean()
| PromSpend | PromTime | DemAffl | |
|---|---|---|---|
| PromClass | |||
| Gold | 8157.519794 | 4.615925 | 9.995192 |
| Platinum | 35729.350294 | 5.312500 | 9.343750 |
| Silver | 1502.637701 | 5.120321 | 9.754848 |
| Tin | 0.010000 | 5.862756 | 9.815940 |
#Get the minimum amount spent, time spent, and affluence for each loyalty status
#(DemAge is not part of this selection)
df_analysis.groupby('PromClass')[['PromSpend', 'PromTime', 'DemAffl']].min()
| PromSpend | PromTime | DemAffl | |
|---|---|---|---|
| PromClass | |||
| Gold | 5000.01 | 0.0 | 0.0 |
| Platinum | 20000.01 | 0.0 | 2.0 |
| Silver | 0.02 | 0.0 | 0.0 |
| Tin | 0.01 | 0.0 | 0.0 |
#Get the maximum amount spent, time spent, and affluence for each loyalty status
#(DemAge is not part of this selection)
df_analysis.groupby('PromClass')[['PromSpend', 'PromTime', 'DemAffl']].max()
| PromSpend | PromTime | DemAffl | |
|---|---|---|---|
| PromClass | |||
| Gold | 20000.00 | 38.0 | 29.0 |
| Platinum | 296313.85 | 32.0 | 21.0 |
| Silver | 5000.00 | 39.0 | 34.0 |
| Tin | 0.01 | 35.0 | 28.0 |
#Determine how many customers are in each loyalty status
# count() on the never-null ID column gives the row count per class
df_analysis.groupby('PromClass')['ID'].count()
PromClass Gold 6324 Platinum 840 Silver 8572 Tin 6487 Name: ID, dtype: int64
#Get the percentage of customers who did and did not buy organic products for
#each loyalty status
# counts per (PromClass, TargetBuy) divided by counts per PromClass give
# within-class shares; the two shares sum to 1 for each loyalty status
promclass_buy = df_analysis.groupby(["PromClass", "TargetBuy"])["ID"].count() / df_analysis.groupby(["PromClass"])["ID"].count()
promclass_buy
PromClass TargetBuy
Gold 0 0.804554
1 0.195446
Platinum 0 0.847619
1 0.152381
Silver 0 0.753616
1 0.246384
Tin 0 0.687221
1 0.312779
Name: ID, dtype: float64
#Only get the percentage of customers who did buy organic products for
#each loyalty status, keep column names accurate and make into a dataframe
promclass_buy = promclass_buy.reset_index()
# keep only the buyer rows (TargetBuy == 1), then drop the now-constant column
promclass_buy = promclass_buy[promclass_buy['TargetBuy'] == 1].drop('TargetBuy', axis=1)
# the ratio column inherited the name 'ID' from the counts; rename it
promclass_buy = promclass_buy.rename(columns={'ID': 'Percentage'})
promclass_buy
| PromClass | Percentage | |
|---|---|---|
| 1 | Gold | 0.195446 |
| 3 | Platinum | 0.152381 |
| 5 | Silver | 0.246384 |
| 7 | Tin | 0.312779 |
#Plot the percentage of customers who did buy organic products for each loyalty status
plt.figure(figsize=(20, 10)) #Have a larger size for presentations
#Color the platinum and tin bars and make the other bars grey to point out the
#difference between the highest and lowest loyalty statuses. The palette list
#is built from the SAME sort as order=, so colors line up with the bars.
plot = sns.barplot(data=promclass_buy,
                   x='PromClass', y='Percentage', palette=['peru' if (x == 'Platinum' or x == 'Tin') else 'lightgrey' for x in promclass_buy.sort_values('Percentage')['PromClass']], order=promclass_buy.sort_values('Percentage')['PromClass'])
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the column chart a title
plt.title("Percentage of Customers Buying Organic Products by Loyalty Class", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who bought organic
# products in that loyalty status
for p in plot.patches:
    # get the height of each bar
    height = p.get_height()
    # adding text to each bar
    plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
              y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
              s = '{:.0%}'.format(height), # data label format of the percentage
              fontsize = 14,
              ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
#Get the share of BUYERS who bought two or more organic products in each
#loyalty status (note: the denominator is customers with TargetBuy == 1,
#not all customers in the class)
promclass_buy2 = df_analysis[df_analysis["TargetAmt"] >= 2].groupby(["PromClass"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["PromClass"])["ID"].count()
promclass_buy2
PromClass Gold 0.135922 Platinum 0.062500 Silver 0.158144 Tin 0.182356 Name: ID, dtype: float64
#Make into dataframe and keep column names accurate
promclass_buy2 = promclass_buy2.reset_index()
# the ratio column inherited the name 'ID' from the counts; rename it
promclass_buy2 = promclass_buy2.rename(columns={'ID': 'Percentage'})
promclass_buy2
| PromClass | Percentage | |
|---|---|---|
| 0 | Gold | 0.135922 |
| 1 | Platinum | 0.062500 |
| 2 | Silver | 0.158144 |
| 3 | Tin | 0.182356 |
#Plot the share of buyers who bought two or more organic products for each loyalty status
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=promclass_buy2,
                   x='PromClass', y='Percentage', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the column chart a title
plt.title("Percentage of Customers Buying Two or More Organic Products by Loyalty Class", fontsize = 20)
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of buyers who bought two or
# more organic products in that loyalty status
for p in plot.patches:
    # get the height of each bar
    height = p.get_height()
    # adding text to each bar
    plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
              y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
              s = '{:.0%}'.format(height), # data label format of the percentage
              fontsize = 14,
              ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
#Get the share of BUYERS who bought three (or more — the filter is >= 3)
#organic products in each loyalty status; the denominator is customers with
#TargetBuy == 1, not all customers in the class
promclass_buy3 = df_analysis[df_analysis["TargetAmt"] >= 3].groupby(["PromClass"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["PromClass"])["ID"].count()
promclass_buy3
PromClass Gold 0.018608 Platinum 0.007812 Silver 0.033144 Tin 0.034993 Name: ID, dtype: float64
#Make into dataframe and keep column names accurate
promclass_buy3 = promclass_buy3.reset_index()
# the ratio column inherited the name 'ID' from the counts; rename it
promclass_buy3 = promclass_buy3.rename(columns={'ID': 'Percentage'})
promclass_buy3
| PromClass | Percentage | |
|---|---|---|
| 0 | Gold | 0.018608 |
| 1 | Platinum | 0.007812 |
| 2 | Silver | 0.033144 |
| 3 | Tin | 0.034993 |
#Plot the share of buyers who bought three or more organic products for each loyalty status
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=promclass_buy3,
                   x='PromClass', y='Percentage', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the column chart a title
plt.title("Percentage of Customers Buying Three Organic Products by Loyalty Class", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of buyers who bought three organic
# products in that loyalty status
for p in plot.patches:
    # get the height of each bar
    height = p.get_height()
    # adding text to each bar
    plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
              y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
              s = '{:.0%}'.format(height), # data label format of the percentage
              fontsize = 14,
              ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
#Get the average age of customers for each loyalty status
# reset_index() turns the groupby result into a plottable two-column DataFrame
promclass_age = df_analysis.groupby('PromClass')['DemAge'].mean().reset_index()
promclass_age
| PromClass | DemAge | |
|---|---|---|
| 0 | Gold | 59.553901 |
| 1 | Platinum | 62.288973 |
| 2 | Silver | 53.652875 |
| 3 | Tin | 47.222038 |
#Plot the average age of customers for each loyalty status
plt.figure(figsize=(20, 10)) #Have a larger size for presentations
#Color the platinum and tin bars and make the other bars grey to point out the
#difference between the highest and lowest loyalty statuses. The palette list
#is built from the SAME sort as order=, so colors line up with the bars.
plot = sns.barplot(data=promclass_age,
                   x='PromClass', y='DemAge', palette=['peru' if (x == 'Platinum' or x == 'Tin') else 'lightgrey' for x in promclass_age.sort_values('DemAge')['PromClass']], order=promclass_age.sort_values('DemAge')['PromClass'])
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the column chart a title
plt.title("Average Age of Customers by Loyalty Class", fontsize = 20)
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine() #Do not have a top and right side border
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the average age of the customers in that loyalty status
for p in plot.patches:
    # get the height of each bar
    height = p.get_height()
    # adding text to each bar
    plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
              y = height+1, # y-coordinate position of data label, padded 1 above bar
              s = '{:.0f}'.format(height), # data label format of the average age
              fontsize = 14,
              ha = 'center') # sets horizontal alignment (ha) to center
plt.show()
#Get the percentage of customers who did and did not buy organic products by gender
# counts per (gender, bought?) divided by counts per gender give within-gender
# shares; the two shares sum to 1 for each gender
gender_buy = df_analysis.groupby(["DemGender", "TargetBuy"])["ID"].count() / df_analysis.groupby(["DemGender"])["ID"].count()
gender_buy
DemGender TargetBuy
F 0 0.653881
1 0.346119
M 0 0.833878
1 0.166122
U 0 0.939897
1 0.060103
Name: ID, dtype: float64
#Only get the percentage of customers who did buy organic products by gender,
#keep the column names accurate and make into a dataframe
gender_buy = gender_buy.reset_index()
# keep only the buyer rows (TargetBuy == 1), then drop the now-constant column
gender_buy = gender_buy[gender_buy['TargetBuy'] == 1].drop('TargetBuy', axis=1)
gender_buy = gender_buy.replace({'F': 'Women', 'M': 'Men', 'U': 'Unspecified'}) #Replace one letter with words for genders
# the ratio column inherited the name 'ID' from the counts; rename it
gender_buy = gender_buy.rename(columns={'ID': 'Percentage'})
gender_buy
| DemGender | Percentage | |
|---|---|---|
| 1 | Women | 0.346119 |
| 3 | Men | 0.166122 |
| 5 | Unspecified | 0.060103 |
# Column chart: share of each gender that bought organic products.
plt.figure(figsize=(20, 10))
# Highlight the 'Women' bar in blue (women buy organics most often); grey out the rest.
bar_colors = ['blue' if g == 'Women' else 'lightgrey' for g in gender_buy['DemGender']]
ax = sns.barplot(data=gender_buy, x='DemGender', y='Percentage', palette=bar_colors)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)                                     # 0% to 100%
plt.title("Percentage of Customers Buying Organic Products by Gender", fontsize=20)
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine()  # remove top and right borders
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# Label each bar with the buying percentage for that gender.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(x=bar.get_x() + bar.get_width() / 2,  # centered on the bar
            y=bar_height + 0.02,                  # padded 0.02 above the bar
            s='{:.0%}'.format(bar_height),        # percentage label
            fontsize=14,
            ha='center')
plt.show()
# Share of organic buyers who bought two or more organic products, per gender.
organic_buyers = df_analysis[df_analysis["TargetBuy"] == 1]
repeat_buyers = df_analysis[df_analysis["TargetAmt"] >= 2]
gender_buy2 = repeat_buyers.groupby(["DemGender"])["ID"].count() / organic_buyers.groupby(["DemGender"])["ID"].count()
gender_buy2
DemGender F 0.158383 M 0.150104 U 0.133333 Name: ID, dtype: float64
# Tidy into a dataframe: spelled-out gender labels, accurate column name.
gender_buy2 = gender_buy2.reset_index()
gender_buy2 = (gender_buy2
               .replace({'F': 'Women', 'M': 'Men', 'U': 'Unspecified'})
               .rename(columns={'ID': 'Percentage'}))
gender_buy2
| DemGender | Percentage | |
|---|---|---|
| 0 | Women | 0.158383 |
| 1 | Men | 0.150104 |
| 2 | Unspecified | 0.133333 |
# Column chart: share of organic buyers purchasing two or more products, by gender.
plt.figure(figsize=(20, 5))  # half height so it takes up half a slide
ax = sns.barplot(data=gender_buy2, x='DemGender', y='Percentage', color='lightblue')
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)                                     # 0% to 100%
plt.title("2 Or More Products", fontsize=16)
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine()  # remove top and right borders
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# Label each bar with the repeat-buying percentage for that gender.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(x=bar.get_x() + bar.get_width() / 2,  # centered on the bar
            y=bar_height + 0.02,                  # padded 0.02 above the bar
            s='{:.0%}'.format(bar_height),
            fontsize=14,
            ha='center')
plt.show()
# Share of organic buyers who bought three organic products, per gender.
organic_buyers = df_analysis[df_analysis["TargetBuy"] == 1]
heavy_buyers = df_analysis[df_analysis["TargetAmt"] >= 3]
gender_buy3 = heavy_buyers.groupby(["DemGender"])["ID"].count() / organic_buyers.groupby(["DemGender"])["ID"].count()
gender_buy3
DemGender F 0.030440 M 0.027950 U 0.019048 Name: ID, dtype: float64
# Tidy into a dataframe: spelled-out gender labels, accurate column name.
gender_buy3 = gender_buy3.reset_index()
gender_buy3 = (gender_buy3
               .replace({'F': 'Women', 'M': 'Men', 'U': 'Unspecified'})
               .rename(columns={'ID': 'Percentage'}))
gender_buy3
| DemGender | Percentage | |
|---|---|---|
| 0 | Women | 0.030440 |
| 1 | Men | 0.027950 |
| 2 | Unspecified | 0.019048 |
# Column chart: share of organic buyers purchasing three products, by gender.
plt.figure(figsize=(20, 5))  # half height so it takes up half a slide
ax = sns.barplot(data=gender_buy3, x='DemGender', y='Percentage', color='lightblue')
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)                                     # 0% to 100%
plt.title("3 Products", fontsize=16)
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
sns.despine()  # remove top and right borders
# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# Label each bar with the three-product buying percentage for that gender.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(x=bar.get_x() + bar.get_width() / 2,  # centered on the bar
            y=bar_height + 0.02,                  # padded 0.02 above the bar
            s='{:.0%}'.format(bar_height),
            fontsize=14,
            ha='center')
plt.show()
# Share of customers who did / did not buy organic products, within each age.
counts_by_age_and_buy = df_analysis.groupby(["DemAge", "TargetBuy"])["ID"].count()
counts_by_age = df_analysis.groupby(["DemAge"])["ID"].count()
age_buy = counts_by_age_and_buy / counts_by_age
age_buy
DemAge TargetBuy
18.0 0 1.000000
19.0 0 1.000000
20.0 0 0.300000
1 0.700000
21.0 0 0.500000
...
77.0 1 0.192593
78.0 0 0.824390
1 0.175610
79.0 0 0.825503
1 0.174497
Name: ID, Length: 122, dtype: float64
# Keep only the share of buyers (TargetBuy == 1) per age and tidy into a
# dataframe with an accurate column name.
age_buy = age_buy.reset_index()
age_buy = (age_buy.loc[age_buy['TargetBuy'] == 1]
           .drop(columns='TargetBuy')
           .rename(columns={'ID': 'Percentage'}))
age_buy
| DemAge | Percentage | |
|---|---|---|
| 3 | 20.0 | 0.700000 |
| 5 | 21.0 | 0.500000 |
| 7 | 22.0 | 0.611111 |
| 9 | 23.0 | 0.652174 |
| 11 | 24.0 | 0.714286 |
| 13 | 25.0 | 0.543478 |
| 15 | 26.0 | 0.638298 |
| 17 | 27.0 | 0.671053 |
| 19 | 28.0 | 0.509615 |
| 21 | 29.0 | 0.626016 |
| 23 | 30.0 | 0.559524 |
| 25 | 31.0 | 0.530806 |
| 27 | 32.0 | 0.645570 |
| 29 | 33.0 | 0.542662 |
| 31 | 34.0 | 0.621053 |
| 33 | 35.0 | 0.594828 |
| 35 | 36.0 | 0.593373 |
| 37 | 37.0 | 0.615764 |
| 39 | 38.0 | 0.580838 |
| 41 | 39.0 | 0.626087 |
| 43 | 40.0 | 0.382199 |
| 45 | 41.0 | 0.396947 |
| 47 | 42.0 | 0.410526 |
| 49 | 43.0 | 0.403226 |
| 51 | 44.0 | 0.408889 |
| 53 | 45.0 | 0.042316 |
| 55 | 46.0 | 0.171367 |
| 57 | 47.0 | 0.156604 |
| 59 | 48.0 | 0.138258 |
| 61 | 49.0 | 0.144250 |
| 63 | 50.0 | 0.131757 |
| 65 | 51.0 | 0.155738 |
| 67 | 52.0 | 0.153704 |
| 69 | 53.0 | 0.135940 |
| 71 | 54.0 | 0.155340 |
| 73 | 55.0 | 0.164356 |
| 75 | 56.0 | 0.162946 |
| 77 | 57.0 | 0.132479 |
| 79 | 58.0 | 0.153374 |
| 81 | 59.0 | 0.150980 |
| 83 | 60.0 | 0.131429 |
| 85 | 61.0 | 0.174797 |
| 87 | 62.0 | 0.150327 |
| 89 | 63.0 | 0.135560 |
| 91 | 64.0 | 0.156522 |
| 93 | 65.0 | 0.137860 |
| 95 | 66.0 | 0.133047 |
| 97 | 67.0 | 0.124717 |
| 99 | 68.0 | 0.190931 |
| 101 | 69.0 | 0.154450 |
| 103 | 70.0 | 0.148541 |
| 105 | 71.0 | 0.141243 |
| 107 | 72.0 | 0.133127 |
| 109 | 73.0 | 0.142857 |
| 111 | 74.0 | 0.141538 |
| 113 | 75.0 | 0.169811 |
| 115 | 76.0 | 0.135762 |
| 117 | 77.0 | 0.192593 |
| 119 | 78.0 | 0.175610 |
| 121 | 79.0 | 0.174497 |
# Line graph: percentage buying organic products at each age.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=age_buy, x='DemAge', y='Percentage', linewidth=3)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.title("Age of Customers and Percentage Buying Organic Products", fontsize=20)
plt.ylim(0, 1)    # 0% to 100%
plt.xlim(19, 80)  # ages 19 to 80
sns.despine()     # remove top and right borders
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
plt.show()
# Share of organic buyers who bought two or more organic products, per age.
organic_buyers = df_analysis[df_analysis["TargetBuy"] == 1]
repeat_buyers = df_analysis[df_analysis["TargetAmt"] >= 2]
age_buy2 = repeat_buyers.groupby(["DemAge"])["ID"].count() / organic_buyers.groupby(["DemAge"])["ID"].count()
age_buy2
DemAge 20.0 NaN 21.0 0.500000 22.0 0.272727 23.0 0.133333 24.0 0.133333 25.0 0.200000 26.0 0.133333 27.0 0.098039 28.0 0.094340 29.0 0.129870 30.0 0.191489 31.0 0.178571 32.0 0.143791 33.0 0.213836 34.0 0.209040 35.0 0.188406 36.0 0.203046 37.0 0.172000 38.0 0.159794 39.0 0.143519 40.0 0.068493 41.0 0.173077 42.0 0.089744 43.0 0.091429 44.0 0.130435 45.0 1.000000 46.0 0.303797 47.0 0.228916 48.0 0.356164 49.0 0.310811 50.0 0.346154 51.0 0.378947 52.0 0.289157 53.0 0.397260 54.0 0.287500 55.0 0.253012 56.0 0.260274 57.0 0.338710 58.0 0.320000 59.0 0.363636 60.0 NaN 61.0 NaN 62.0 NaN 63.0 NaN 64.0 NaN 65.0 NaN 66.0 NaN 67.0 NaN 68.0 NaN 69.0 NaN 70.0 NaN 71.0 NaN 72.0 NaN 73.0 NaN 74.0 NaN 75.0 NaN 76.0 NaN 77.0 NaN 78.0 NaN 79.0 NaN Name: ID, dtype: float64
#Make into dataframe and keep column names accurate
age_buy2 = age_buy2.reset_index()
age_buy2 = age_buy2.rename(columns={'ID': 'Percentage'})
#Replace null with 0%. Assign back instead of calling fillna(inplace=True)
#on a column selection: that form is chained assignment, which is deprecated
#in pandas 2.x and has no effect under copy-on-write.
age_buy2['Percentage'] = age_buy2['Percentage'].fillna(0)
age_buy2
| DemAge | Percentage | |
|---|---|---|
| 0 | 20.0 | 0.000000 |
| 1 | 21.0 | 0.500000 |
| 2 | 22.0 | 0.272727 |
| 3 | 23.0 | 0.133333 |
| 4 | 24.0 | 0.133333 |
| 5 | 25.0 | 0.200000 |
| 6 | 26.0 | 0.133333 |
| 7 | 27.0 | 0.098039 |
| 8 | 28.0 | 0.094340 |
| 9 | 29.0 | 0.129870 |
| 10 | 30.0 | 0.191489 |
| 11 | 31.0 | 0.178571 |
| 12 | 32.0 | 0.143791 |
| 13 | 33.0 | 0.213836 |
| 14 | 34.0 | 0.209040 |
| 15 | 35.0 | 0.188406 |
| 16 | 36.0 | 0.203046 |
| 17 | 37.0 | 0.172000 |
| 18 | 38.0 | 0.159794 |
| 19 | 39.0 | 0.143519 |
| 20 | 40.0 | 0.068493 |
| 21 | 41.0 | 0.173077 |
| 22 | 42.0 | 0.089744 |
| 23 | 43.0 | 0.091429 |
| 24 | 44.0 | 0.130435 |
| 25 | 45.0 | 1.000000 |
| 26 | 46.0 | 0.303797 |
| 27 | 47.0 | 0.228916 |
| 28 | 48.0 | 0.356164 |
| 29 | 49.0 | 0.310811 |
| 30 | 50.0 | 0.346154 |
| 31 | 51.0 | 0.378947 |
| 32 | 52.0 | 0.289157 |
| 33 | 53.0 | 0.397260 |
| 34 | 54.0 | 0.287500 |
| 35 | 55.0 | 0.253012 |
| 36 | 56.0 | 0.260274 |
| 37 | 57.0 | 0.338710 |
| 38 | 58.0 | 0.320000 |
| 39 | 59.0 | 0.363636 |
| 40 | 60.0 | 0.000000 |
| 41 | 61.0 | 0.000000 |
| 42 | 62.0 | 0.000000 |
| 43 | 63.0 | 0.000000 |
| 44 | 64.0 | 0.000000 |
| 45 | 65.0 | 0.000000 |
| 46 | 66.0 | 0.000000 |
| 47 | 67.0 | 0.000000 |
| 48 | 68.0 | 0.000000 |
| 49 | 69.0 | 0.000000 |
| 50 | 70.0 | 0.000000 |
| 51 | 71.0 | 0.000000 |
| 52 | 72.0 | 0.000000 |
| 53 | 73.0 | 0.000000 |
| 54 | 74.0 | 0.000000 |
| 55 | 75.0 | 0.000000 |
| 56 | 76.0 | 0.000000 |
| 57 | 77.0 | 0.000000 |
| 58 | 78.0 | 0.000000 |
| 59 | 79.0 | 0.000000 |
# Line graph: share of organic buyers purchasing two or more products, by age.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=age_buy2, x='DemAge', y='Percentage', linewidth=3)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)    # 0% to 100%
plt.xlim(19, 80)  # ages 19 to 80
plt.title("Age of Customers Buying Organic Products and Percentage Buying Two Or More Organic Products", fontsize=20)
sns.despine()     # remove top and right borders
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
plt.show()
# Share of organic buyers who bought three organic products, per age.
organic_buyers = df_analysis[df_analysis["TargetBuy"] == 1]
heavy_buyers = df_analysis[df_analysis["TargetAmt"] >= 3]
age_buy3 = heavy_buyers.groupby(["DemAge"])["ID"].count() / organic_buyers.groupby(["DemAge"])["ID"].count()
age_buy3
DemAge 20.0 NaN 21.0 NaN 22.0 0.181818 23.0 0.066667 24.0 0.133333 25.0 0.080000 26.0 0.066667 27.0 0.019608 28.0 0.056604 29.0 0.051948 30.0 0.063830 31.0 0.053571 32.0 0.078431 33.0 0.106918 34.0 0.067797 35.0 0.082126 36.0 0.091371 37.0 0.080000 38.0 0.082474 39.0 0.055556 40.0 NaN 41.0 NaN 42.0 NaN 43.0 NaN 44.0 NaN 45.0 NaN 46.0 NaN 47.0 NaN 48.0 NaN 49.0 NaN 50.0 NaN 51.0 NaN 52.0 NaN 53.0 NaN 54.0 NaN 55.0 NaN 56.0 NaN 57.0 NaN 58.0 NaN 59.0 NaN 60.0 NaN 61.0 NaN 62.0 NaN 63.0 NaN 64.0 NaN 65.0 NaN 66.0 NaN 67.0 NaN 68.0 NaN 69.0 NaN 70.0 NaN 71.0 NaN 72.0 NaN 73.0 NaN 74.0 NaN 75.0 NaN 76.0 NaN 77.0 NaN 78.0 NaN 79.0 NaN Name: ID, dtype: float64
#Make into dataframe and keep column names accurate
age_buy3 = age_buy3.reset_index()
age_buy3 = age_buy3.rename(columns={'ID': 'Percentage'})
#Replace null with 0%. Assign back instead of calling fillna(inplace=True)
#on a column selection: that form is chained assignment, which is deprecated
#in pandas 2.x and has no effect under copy-on-write.
age_buy3['Percentage'] = age_buy3['Percentage'].fillna(0)
age_buy3
| DemAge | Percentage | |
|---|---|---|
| 0 | 20.0 | 0.000000 |
| 1 | 21.0 | 0.000000 |
| 2 | 22.0 | 0.181818 |
| 3 | 23.0 | 0.066667 |
| 4 | 24.0 | 0.133333 |
| 5 | 25.0 | 0.080000 |
| 6 | 26.0 | 0.066667 |
| 7 | 27.0 | 0.019608 |
| 8 | 28.0 | 0.056604 |
| 9 | 29.0 | 0.051948 |
| 10 | 30.0 | 0.063830 |
| 11 | 31.0 | 0.053571 |
| 12 | 32.0 | 0.078431 |
| 13 | 33.0 | 0.106918 |
| 14 | 34.0 | 0.067797 |
| 15 | 35.0 | 0.082126 |
| 16 | 36.0 | 0.091371 |
| 17 | 37.0 | 0.080000 |
| 18 | 38.0 | 0.082474 |
| 19 | 39.0 | 0.055556 |
| 20 | 40.0 | 0.000000 |
| 21 | 41.0 | 0.000000 |
| 22 | 42.0 | 0.000000 |
| 23 | 43.0 | 0.000000 |
| 24 | 44.0 | 0.000000 |
| 25 | 45.0 | 0.000000 |
| 26 | 46.0 | 0.000000 |
| 27 | 47.0 | 0.000000 |
| 28 | 48.0 | 0.000000 |
| 29 | 49.0 | 0.000000 |
| 30 | 50.0 | 0.000000 |
| 31 | 51.0 | 0.000000 |
| 32 | 52.0 | 0.000000 |
| 33 | 53.0 | 0.000000 |
| 34 | 54.0 | 0.000000 |
| 35 | 55.0 | 0.000000 |
| 36 | 56.0 | 0.000000 |
| 37 | 57.0 | 0.000000 |
| 38 | 58.0 | 0.000000 |
| 39 | 59.0 | 0.000000 |
| 40 | 60.0 | 0.000000 |
| 41 | 61.0 | 0.000000 |
| 42 | 62.0 | 0.000000 |
| 43 | 63.0 | 0.000000 |
| 44 | 64.0 | 0.000000 |
| 45 | 65.0 | 0.000000 |
| 46 | 66.0 | 0.000000 |
| 47 | 67.0 | 0.000000 |
| 48 | 68.0 | 0.000000 |
| 49 | 69.0 | 0.000000 |
| 50 | 70.0 | 0.000000 |
| 51 | 71.0 | 0.000000 |
| 52 | 72.0 | 0.000000 |
| 53 | 73.0 | 0.000000 |
| 54 | 74.0 | 0.000000 |
| 55 | 75.0 | 0.000000 |
| 56 | 76.0 | 0.000000 |
| 57 | 77.0 | 0.000000 |
| 58 | 78.0 | 0.000000 |
| 59 | 79.0 | 0.000000 |
# Line graph: share of organic buyers purchasing three products, by age.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=age_buy3, x='DemAge', y='Percentage', linewidth=3)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)    # 0% to 100%
plt.xlim(19, 80)  # ages 19 to 80
plt.title("Age of Customers Buying Organic Products and Percentage Buying Three Organic Products", fontsize=20)
sns.despine()     # remove top and right borders
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
plt.show()
# Share of customers who did / did not buy organics, within each affluence grade.
counts_by_affl_and_buy = df_analysis.groupby(["DemAffl", "TargetBuy"])["ID"].count()
counts_by_affl = df_analysis.groupby(["DemAffl"])["ID"].count()
affl_buy = counts_by_affl_and_buy / counts_by_affl
affl_buy
DemAffl TargetBuy
0.0 0 1.000000
1.0 0 1.000000
2.0 0 0.950000
1 0.050000
3.0 0 0.915572
1 0.084428
4.0 0 0.924720
1 0.075280
5.0 0 0.896510
1 0.103490
6.0 0 0.864369
1 0.135631
7.0 0 0.846600
1 0.153400
8.0 0 0.810934
1 0.189066
9.0 0 0.765867
1 0.234133
10.0 0 0.730203
1 0.269797
11.0 0 0.681981
1 0.318019
12.0 0 0.626914
1 0.373086
13.0 0 0.546729
1 0.453271
14.0 0 0.474747
1 0.525253
15.0 0 0.411911
1 0.588089
16.0 0 0.318367
1 0.681633
17.0 0 0.275862
1 0.724138
18.0 0 0.213483
1 0.786517
19.0 0 0.112676
1 0.887324
20.0 0 0.016949
1 0.983051
21.0 1 1.000000
22.0 1 1.000000
23.0 1 1.000000
24.0 1 1.000000
25.0 1 1.000000
26.0 1 1.000000
27.0 1 1.000000
28.0 1 1.000000
29.0 1 1.000000
30.0 1 1.000000
31.0 1 1.000000
34.0 1 1.000000
Name: ID, dtype: float64
# Keep only the share of buyers (TargetBuy == 1) per affluence grade and
# tidy into a dataframe with an accurate column name.
affl_buy = affl_buy.reset_index()
affl_buy = (affl_buy.loc[affl_buy['TargetBuy'] == 1]
            .drop(columns='TargetBuy')
            .rename(columns={'ID': 'Percentage'}))
affl_buy
| DemAffl | Percentage | |
|---|---|---|
| 3 | 2.0 | 0.050000 |
| 5 | 3.0 | 0.084428 |
| 7 | 4.0 | 0.075280 |
| 9 | 5.0 | 0.103490 |
| 11 | 6.0 | 0.135631 |
| 13 | 7.0 | 0.153400 |
| 15 | 8.0 | 0.189066 |
| 17 | 9.0 | 0.234133 |
| 19 | 10.0 | 0.269797 |
| 21 | 11.0 | 0.318019 |
| 23 | 12.0 | 0.373086 |
| 25 | 13.0 | 0.453271 |
| 27 | 14.0 | 0.525253 |
| 29 | 15.0 | 0.588089 |
| 31 | 16.0 | 0.681633 |
| 33 | 17.0 | 0.724138 |
| 35 | 18.0 | 0.786517 |
| 37 | 19.0 | 0.887324 |
| 39 | 20.0 | 0.983051 |
| 40 | 21.0 | 1.000000 |
| 41 | 22.0 | 1.000000 |
| 42 | 23.0 | 1.000000 |
| 43 | 24.0 | 1.000000 |
| 44 | 25.0 | 1.000000 |
| 45 | 26.0 | 1.000000 |
| 46 | 27.0 | 1.000000 |
| 47 | 28.0 | 1.000000 |
| 48 | 29.0 | 1.000000 |
| 49 | 30.0 | 1.000000 |
| 50 | 31.0 | 1.000000 |
| 51 | 34.0 | 1.000000 |
# Line graph: percentage buying organic products at each affluence grade.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=affl_buy, x='DemAffl', y='Percentage', linewidth=3)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)  # 0% to 100%
plt.title("Affluence Grade of Customers and Percentage Buying Organic Products", fontsize=20)
ax.set(xlabel=None, ylabel=None)  # no axis titles
sns.despine()  # remove top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
plt.show()
# Share of organic buyers who bought two or more products, per affluence grade.
organic_buyers = df_analysis[df_analysis["TargetBuy"] == 1]
repeat_buyers = df_analysis[df_analysis["TargetAmt"] >= 2]
affl_buy2 = repeat_buyers.groupby(["DemAffl"])["ID"].count() / organic_buyers.groupby(["DemAffl"])["ID"].count()
affl_buy2
DemAffl 2.0 NaN 3.0 NaN 4.0 NaN 5.0 NaN 6.0 0.009901 7.0 0.017632 8.0 0.018072 9.0 0.049587 10.0 0.071672 11.0 0.108818 12.0 0.125270 13.0 0.208763 14.0 0.259615 15.0 0.337553 16.0 0.335329 17.0 0.600000 18.0 0.642857 19.0 0.825397 20.0 0.862069 21.0 0.861111 22.0 0.920000 23.0 1.000000 24.0 1.000000 25.0 1.000000 26.0 1.000000 27.0 1.000000 28.0 1.000000 29.0 1.000000 30.0 1.000000 31.0 1.000000 34.0 1.000000 Name: ID, dtype: float64
#Make into dataframe and keep column names accurate
affl_buy2 = affl_buy2.reset_index()
affl_buy2 = affl_buy2.rename(columns={'ID': 'Percentage'})
#Replace null with 0% (comment moved to the fillna line it describes).
#Assign back instead of calling fillna(inplace=True) on a column selection:
#that form is chained assignment, deprecated in pandas 2.x and a no-op
#under copy-on-write.
affl_buy2['Percentage'] = affl_buy2['Percentage'].fillna(0)
affl_buy2
| DemAffl | Percentage | |
|---|---|---|
| 0 | 2.0 | 0.000000 |
| 1 | 3.0 | 0.000000 |
| 2 | 4.0 | 0.000000 |
| 3 | 5.0 | 0.000000 |
| 4 | 6.0 | 0.009901 |
| 5 | 7.0 | 0.017632 |
| 6 | 8.0 | 0.018072 |
| 7 | 9.0 | 0.049587 |
| 8 | 10.0 | 0.071672 |
| 9 | 11.0 | 0.108818 |
| 10 | 12.0 | 0.125270 |
| 11 | 13.0 | 0.208763 |
| 12 | 14.0 | 0.259615 |
| 13 | 15.0 | 0.337553 |
| 14 | 16.0 | 0.335329 |
| 15 | 17.0 | 0.600000 |
| 16 | 18.0 | 0.642857 |
| 17 | 19.0 | 0.825397 |
| 18 | 20.0 | 0.862069 |
| 19 | 21.0 | 0.861111 |
| 20 | 22.0 | 0.920000 |
| 21 | 23.0 | 1.000000 |
| 22 | 24.0 | 1.000000 |
| 23 | 25.0 | 1.000000 |
| 24 | 26.0 | 1.000000 |
| 25 | 27.0 | 1.000000 |
| 26 | 28.0 | 1.000000 |
| 27 | 29.0 | 1.000000 |
| 28 | 30.0 | 1.000000 |
| 29 | 31.0 | 1.000000 |
| 30 | 34.0 | 1.000000 |
# Line graph: share of organic buyers purchasing two or more products,
# by affluence grade.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=affl_buy2, x='DemAffl', y='Percentage', linewidth=3)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)  # 0% to 100%
plt.title("Affluence Grade of Customers Buying Organic Products and Percentage Buying Two or More Organic Products", fontsize=20)
ax.set(xlabel=None, ylabel=None)  # no axis titles
sns.despine()  # remove top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
plt.show()
# Share of organic buyers who bought three products, per affluence grade
# (the original comment said "by age" — this groups by DemAffl).
organic_buyers = df_analysis[df_analysis["TargetBuy"] == 1]
heavy_buyers = df_analysis[df_analysis["TargetAmt"] >= 3]
affl_buy3 = heavy_buyers.groupby(["DemAffl"])["ID"].count() / organic_buyers.groupby(["DemAffl"])["ID"].count()
affl_buy3
DemAffl 2.0 NaN 3.0 NaN 4.0 NaN 5.0 NaN 6.0 NaN 7.0 NaN 8.0 NaN 9.0 NaN 10.0 NaN 11.0 0.003752 12.0 0.010799 13.0 0.012887 14.0 0.025641 15.0 0.046414 16.0 0.029940 17.0 0.085714 18.0 0.128571 19.0 0.269841 20.0 0.327586 21.0 0.333333 22.0 0.440000 23.0 0.411765 24.0 0.266667 25.0 1.000000 26.0 0.500000 27.0 1.000000 28.0 1.000000 29.0 1.000000 30.0 1.000000 31.0 1.000000 34.0 1.000000 Name: ID, dtype: float64
#Make into dataframe and keep column names accurate
affl_buy3 = affl_buy3.reset_index()
affl_buy3 = affl_buy3.rename(columns={'ID': 'Percentage'})
#Replace null with 0%. Assign back instead of calling fillna(inplace=True)
#on a column selection: that form is chained assignment, which is deprecated
#in pandas 2.x and has no effect under copy-on-write.
affl_buy3['Percentage'] = affl_buy3['Percentage'].fillna(0)
affl_buy3
| DemAffl | Percentage | |
|---|---|---|
| 0 | 2.0 | 0.000000 |
| 1 | 3.0 | 0.000000 |
| 2 | 4.0 | 0.000000 |
| 3 | 5.0 | 0.000000 |
| 4 | 6.0 | 0.000000 |
| 5 | 7.0 | 0.000000 |
| 6 | 8.0 | 0.000000 |
| 7 | 9.0 | 0.000000 |
| 8 | 10.0 | 0.000000 |
| 9 | 11.0 | 0.003752 |
| 10 | 12.0 | 0.010799 |
| 11 | 13.0 | 0.012887 |
| 12 | 14.0 | 0.025641 |
| 13 | 15.0 | 0.046414 |
| 14 | 16.0 | 0.029940 |
| 15 | 17.0 | 0.085714 |
| 16 | 18.0 | 0.128571 |
| 17 | 19.0 | 0.269841 |
| 18 | 20.0 | 0.327586 |
| 19 | 21.0 | 0.333333 |
| 20 | 22.0 | 0.440000 |
| 21 | 23.0 | 0.411765 |
| 22 | 24.0 | 0.266667 |
| 23 | 25.0 | 1.000000 |
| 24 | 26.0 | 0.500000 |
| 25 | 27.0 | 1.000000 |
| 26 | 28.0 | 1.000000 |
| 27 | 29.0 | 1.000000 |
| 28 | 30.0 | 1.000000 |
| 29 | 31.0 | 1.000000 |
| 30 | 34.0 | 1.000000 |
# Line graph: share of organic buyers purchasing three products,
# by affluence grade.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=affl_buy3, x='DemAffl', y='Percentage', linewidth=3)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # y-axis as percentages
plt.ylim(0, 1)  # 0% to 100%
plt.title("Affluence Grade of Customers Buying Organic Products and Percentage Buying Three Organic Products", fontsize=20)
ax.set(xlabel=None, ylabel=None)  # no axis titles
sns.despine()  # remove top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
plt.show()
# Average amount spent in store, per affluence grade.
mean_spend = df_analysis.groupby('DemAffl')['PromSpend'].mean()
affl_spend = mean_spend.reset_index()
affl_spend
| DemAffl | PromSpend | |
|---|---|---|
| 0 | 0.0 | 4217.228333 |
| 1 | 1.0 | 4106.715833 |
| 2 | 2.0 | 4131.813700 |
| 3 | 3.0 | 4833.379231 |
| 4 | 4.0 | 4163.785941 |
| 5 | 5.0 | 4719.320463 |
| 6 | 6.0 | 4487.887167 |
| 7 | 7.0 | 4598.021167 |
| 8 | 8.0 | 4268.491409 |
| 9 | 9.0 | 4437.048189 |
| 10 | 10.0 | 4774.821013 |
| 11 | 11.0 | 4533.221068 |
| 12 | 12.0 | 4367.005302 |
| 13 | 13.0 | 4085.777278 |
| 14 | 14.0 | 4256.772559 |
| 15 | 15.0 | 5351.760372 |
| 16 | 16.0 | 3308.645592 |
| 17 | 17.0 | 2286.102138 |
| 18 | 18.0 | 3223.334157 |
| 19 | 19.0 | 3488.155070 |
| 20 | 20.0 | 3034.970508 |
| 21 | 21.0 | 3324.370833 |
| 22 | 22.0 | 2034.605200 |
| 23 | 23.0 | 3500.861176 |
| 24 | 24.0 | 4068.178667 |
| 25 | 25.0 | 2167.387692 |
| 26 | 26.0 | 1258.771667 |
| 27 | 27.0 | 879.598000 |
| 28 | 28.0 | 0.010000 |
| 29 | 29.0 | 6166.666667 |
| 30 | 30.0 | 1453.333333 |
| 31 | 31.0 | 300.000000 |
| 32 | 34.0 | 400.000000 |
# Average, minimum, and maximum amount spent plus the number of customers,
# per affluence grade.
spend_stats = df_analysis.groupby('DemAffl')['PromSpend'].agg(['mean', 'min', 'max', 'count'])
affl_info = spend_stats.reset_index()
affl_info
| DemAffl | mean | min | max | count | |
|---|---|---|---|---|---|
| 0 | 0.0 | 4217.228333 | 0.01 | 12000.00 | 6 |
| 1 | 1.0 | 4106.715833 | 0.01 | 18000.00 | 36 |
| 2 | 2.0 | 4131.813700 | 0.01 | 24053.35 | 200 |
| 3 | 3.0 | 4833.379231 | 0.01 | 65053.35 | 533 |
| 4 | 4.0 | 4163.785941 | 0.01 | 65000.00 | 983 |
| 5 | 5.0 | 4719.320463 | 0.01 | 94288.31 | 1662 |
| 6 | 6.0 | 4487.887167 | 0.01 | 97000.00 | 2234 |
| 7 | 7.0 | 4598.021167 | 0.01 | 81551.03 | 2588 |
| 8 | 8.0 | 4268.491409 | 0.01 | 100000.00 | 2634 |
| 9 | 9.0 | 4437.048189 | 0.01 | 110072.44 | 2584 |
| 10 | 10.0 | 4774.821013 | 0.01 | 296313.85 | 2172 |
| 11 | 11.0 | 4533.221068 | 0.01 | 78292.50 | 1676 |
| 12 | 12.0 | 4367.005302 | 0.01 | 95000.00 | 1241 |
| 13 | 13.0 | 4085.777278 | 0.01 | 120000.00 | 856 |
| 14 | 14.0 | 4256.772559 | 0.01 | 100000.00 | 594 |
| 15 | 15.0 | 5351.760372 | 0.01 | 239542.13 | 403 |
| 16 | 16.0 | 3308.645592 | 0.01 | 48301.78 | 245 |
| 17 | 17.0 | 2286.102138 | 0.01 | 30937.60 | 145 |
| 18 | 18.0 | 3223.334157 | 0.01 | 37000.00 | 89 |
| 19 | 19.0 | 3488.155070 | 0.01 | 47000.00 | 71 |
| 20 | 20.0 | 3034.970508 | 0.01 | 24106.70 | 59 |
| 21 | 21.0 | 3324.370833 | 0.01 | 29673.44 | 36 |
| 22 | 22.0 | 2034.605200 | 0.01 | 12000.00 | 25 |
| 23 | 23.0 | 3500.861176 | 0.01 | 17553.35 | 17 |
| 24 | 24.0 | 4068.178667 | 0.01 | 11500.00 | 15 |
| 25 | 25.0 | 2167.387692 | 0.01 | 6000.00 | 13 |
| 26 | 26.0 | 1258.771667 | 0.01 | 6000.00 | 6 |
| 27 | 27.0 | 879.598000 | 0.01 | 3397.96 | 5 |
| 28 | 28.0 | 0.010000 | 0.01 | 0.01 | 1 |
| 29 | 29.0 | 6166.666667 | 2000.00 | 13500.00 | 3 |
| 30 | 30.0 | 1453.333333 | 360.00 | 3000.00 | 3 |
| 31 | 31.0 | 300.000000 | 300.00 | 300.00 | 2 |
| 32 | 34.0 | 400.000000 | 400.00 | 400.00 | 1 |
# Average, minimum, and maximum amount of time spent as a customer,
# per affluence grade.
time_stats = df_analysis.groupby('DemAffl')['PromTime'].agg(['mean', 'min', 'max'])
affl_time = time_stats.reset_index()
affl_time
| DemAffl | mean | min | max | |
|---|---|---|---|---|
| 0 | 0.0 | 6.166667 | 4.0 | 9.0 |
| 1 | 1.0 | 7.527778 | 2.0 | 31.0 |
| 2 | 2.0 | 7.191919 | 0.0 | 32.0 |
| 3 | 3.0 | 6.717514 | 0.0 | 32.0 |
| 4 | 4.0 | 6.662910 | 0.0 | 33.0 |
| 5 | 5.0 | 6.557927 | 0.0 | 33.0 |
| 6 | 6.0 | 6.677083 | 0.0 | 38.0 |
| 7 | 7.0 | 6.574794 | 0.0 | 32.0 |
| 8 | 8.0 | 6.576819 | 0.0 | 36.0 |
| 9 | 9.0 | 6.543418 | 0.0 | 35.0 |
| 10 | 10.0 | 6.522548 | 0.0 | 35.0 |
| 11 | 11.0 | 6.601567 | 0.0 | 33.0 |
| 12 | 12.0 | 6.465798 | 0.0 | 39.0 |
| 13 | 13.0 | 6.284371 | 1.0 | 36.0 |
| 14 | 14.0 | 6.432479 | 1.0 | 32.0 |
| 15 | 15.0 | 6.275949 | 0.0 | 32.0 |
| 16 | 16.0 | 6.207469 | 0.0 | 25.0 |
| 17 | 17.0 | 6.331034 | 1.0 | 27.0 |
| 18 | 18.0 | 6.551724 | 1.0 | 27.0 |
| 19 | 19.0 | 5.169014 | 1.0 | 12.0 |
| 20 | 20.0 | 5.719298 | 1.0 | 17.0 |
| 21 | 21.0 | 5.500000 | 1.0 | 10.0 |
| 22 | 22.0 | 5.680000 | 1.0 | 12.0 |
| 23 | 23.0 | 4.625000 | 1.0 | 16.0 |
| 24 | 24.0 | 4.785714 | 1.0 | 10.0 |
| 25 | 25.0 | 5.153846 | 1.0 | 11.0 |
| 26 | 26.0 | 5.666667 | 3.0 | 13.0 |
| 27 | 27.0 | 5.200000 | 3.0 | 10.0 |
| 28 | 28.0 | 8.000000 | 8.0 | 8.0 |
| 29 | 29.0 | 4.333333 | 2.0 | 9.0 |
| 30 | 30.0 | 4.500000 | 2.0 | 7.0 |
| 31 | 31.0 | 9.500000 | 8.0 | 11.0 |
| 32 | 34.0 | 9.000000 | 9.0 | 9.0 |
# Line graph: average amount spent in store this year at each affluence grade.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=affl_spend, x='DemAffl', y='PromSpend', linewidth=3)
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.title("Affluence Grade of Customers and Average Amount Spent in Store this Year", fontsize=20)
sns.despine()  # remove top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
ax.yaxis.set_major_formatter('${x:1.0f}')  # y-axis in dollars
plt.show()
# Same average-spend line graph, restricted to affluence grades with at
# least ten customers to reduce small-sample noise.
plt.figure(figsize=(20, 10))
grades_10_plus = affl_info[affl_info['count'] >= 10]
ax = sns.lineplot(data=grades_10_plus, x='DemAffl', y='mean', linewidth=3)
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.title("Affluence Grade of Customers and Average Amount Spent in Store this Year", fontsize=20)
sns.despine()  # remove top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
ax.yaxis.set_major_formatter('${x:1.0f}')  # y-axis in dollars
plt.show()
# Same average-spend line graph, restricted to affluence grades with at
# least twenty customers.
plt.figure(figsize=(20, 10))
grades_20_plus = affl_info[affl_info['count'] >= 20]
ax = sns.lineplot(data=grades_20_plus, x='DemAffl', y='mean', linewidth=3)
ax.set(xlabel=None, ylabel=None)  # no axis titles
plt.ylim(0, 5500)  # $0 to $5,500
plt.title("Affluence Grade of Customers and Average Amount Spent in Store this Year", fontsize=20)
sns.despine()  # remove top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)  # tick font size
ax.yaxis.set_major_formatter('${x:1.0f}')  # y-axis in dollars
plt.show()
# Average, minimum, and maximum customer age per affluence grade.
age_stats = df_analysis.groupby('DemAffl')['DemAge'].agg(['mean', 'min', 'max'])
affl_age = age_stats.reset_index()
affl_age
| DemAffl | mean | min | max | |
|---|---|---|---|---|
| 0 | 0.0 | 57.600000 | 41.0 | 72.0 |
| 1 | 1.0 | 57.593750 | 37.0 | 77.0 |
| 2 | 2.0 | 56.075269 | 25.0 | 79.0 |
| 3 | 3.0 | 55.392079 | 18.0 | 79.0 |
| 4 | 4.0 | 56.039871 | 19.0 | 79.0 |
| 5 | 5.0 | 55.145631 | 19.0 | 79.0 |
| 6 | 6.0 | 55.127946 | 18.0 | 79.0 |
| 7 | 7.0 | 54.663633 | 18.0 | 79.0 |
| 8 | 8.0 | 54.015789 | 19.0 | 79.0 |
| 9 | 9.0 | 53.894737 | 19.0 | 79.0 |
| 10 | 10.0 | 53.967996 | 18.0 | 79.0 |
| 11 | 11.0 | 53.377007 | 20.0 | 79.0 |
| 12 | 12.0 | 53.117749 | 22.0 | 79.0 |
| 13 | 13.0 | 51.778481 | 18.0 | 79.0 |
| 14 | 14.0 | 51.514493 | 21.0 | 79.0 |
| 15 | 15.0 | 50.501319 | 22.0 | 78.0 |
| 16 | 16.0 | 48.914163 | 25.0 | 77.0 |
| 17 | 17.0 | 47.462121 | 22.0 | 76.0 |
| 18 | 18.0 | 46.953488 | 24.0 | 79.0 |
| 19 | 19.0 | 43.936508 | 26.0 | 74.0 |
| 20 | 20.0 | 41.545455 | 22.0 | 77.0 |
| 21 | 21.0 | 41.242424 | 29.0 | 59.0 |
| 22 | 22.0 | 40.478261 | 28.0 | 73.0 |
| 23 | 23.0 | 38.176471 | 22.0 | 58.0 |
| 24 | 24.0 | 44.133333 | 27.0 | 59.0 |
| 25 | 25.0 | 34.166667 | 25.0 | 38.0 |
| 26 | 26.0 | 41.400000 | 30.0 | 55.0 |
| 27 | 27.0 | 35.400000 | 30.0 | 38.0 |
| 28 | 28.0 | 33.000000 | 33.0 | 33.0 |
| 29 | 29.0 | 32.666667 | 29.0 | 35.0 |
| 30 | 30.0 | 37.666667 | 36.0 | 39.0 |
| 31 | 31.0 | 36.500000 | 35.0 | 38.0 |
| 32 | 34.0 | 38.000000 | 38.0 | 38.0 |
# Line chart: average customer age at each affluence grade
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=affl_age, x='DemAffl', y='mean', linewidth=3)
# Title the line graph
plt.title("Affluence Grade of Customers and Average Age", fontsize=20)
plt.ylim(0, 60)  # age axis runs from 0 to 60 years
# Label both axes, matching the tick font size
plt.xlabel("Affluence Grade", fontsize=11)
plt.ylabel("Average Age", fontsize=11)
sns.despine()  # remove the top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)
plt.show()
# Average amount spent this year for each customer age
age_spend = (
    df_analysis.groupby('DemAge')['PromSpend']
    .mean()
    .reset_index()
)
age_spend
| | DemAge | PromSpend |
|---|---|---|
| 0 | 18.0 | 2495.000000 |
| 1 | 19.0 | 1929.041000 |
| 2 | 20.0 | 3821.196000 |
| 3 | 21.0 | 2937.500000 |
| 4 | 22.0 | 2988.736111 |
| ... | ... | ... |
| 57 | 75.0 | 7652.286906 |
| 58 | 76.0 | 8263.677285 |
| 59 | 77.0 | 7668.657259 |
| 60 | 78.0 | 7139.649268 |
| 61 | 79.0 | 7203.240000 |
62 rows × 2 columns
# Line chart: average spend this year at each age
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=age_spend, x='DemAge', y='PromSpend', linewidth=3)
# Title the line graph
plt.title("Average Amount Spent this Year by Age", fontsize=20)
ax.set(xlabel=None, ylabel=None)  # drop the default axis labels
plt.ylim(0, 9000)  # y-axis spans $0-$9,000
sns.despine()  # remove the top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)
# Dollar-format the y-axis tick labels
ax.yaxis.set_major_formatter('${x:1.0f}')
plt.show()
# Line chart: number of customers at each affluence grade
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=affl_info, x='DemAffl', y='count', linewidth=3)
ax.set(xlabel=None, ylabel=None)  # drop the default axis labels
# Title the line graph
plt.title("Number of Customers by Affluence Grade", fontsize=20)
sns.despine()  # remove the top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)
plt.show()
# Count customers at each age; rename the ID column to reflect the count
age_count = (
    df_analysis.groupby('DemAge')['ID']
    .count()
    .reset_index()
    .rename(columns={'ID': 'Customers'})
)
age_count
| | DemAge | Customers |
|---|---|---|
| 0 | 18.0 | 5 |
| 1 | 19.0 | 10 |
| 2 | 20.0 | 10 |
| 3 | 21.0 | 8 |
| 4 | 22.0 | 18 |
| ... | ... | ... |
| 57 | 75.0 | 265 |
| 58 | 76.0 | 302 |
| 59 | 77.0 | 270 |
| 60 | 78.0 | 205 |
| 61 | 79.0 | 149 |
62 rows × 2 columns
# Line chart: number of customers at each age
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=age_count, x='DemAge', y='Customers', linewidth=3)
ax.set(xlabel=None, ylabel=None)  # drop the default axis labels
# Title the line graph
plt.title("Number of Customers by Age", fontsize=20)
sns.despine()  # remove the top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)
plt.show()
# Line chart: average spend at each affluence grade
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=affl_spend, x='DemAffl', y='PromSpend', linewidth=3)
ax.set(xlabel=None, ylabel=None)  # drop the default axis labels
# Title the line graph
plt.title("Average Spent By Affluence Grade", fontsize=20)
sns.despine()  # remove the top and right borders
plt.tick_params(axis='both', which='major', labelsize=11)
plt.show()
#Count the customers who purchased one, two, or three organic products
amt_purchased = df_analysis.groupby('TargetAmt')['ID'].count().reset_index()
amt_purchased = amt_purchased.rename(columns={'ID': 'Customers'}) #Keep the column name accurate
#Keep only customers who bought at least one organic product.
#Filtering on the TargetAmt value is safer than drop(0): drop removes whatever
#row carries index label 0, which would be the TargetAmt==1 row if no customer
#happened to have TargetAmt==0.
amt_purchased = amt_purchased[amt_purchased['TargetAmt'] != 0]
amt_purchased
| | TargetAmt | Customers |
|---|---|---|
| 1 | 1 | 4625 |
| 2 | 2 | 715 |
| 3 | 3 | 165 |
# Column chart: customers by number of organic products bought.
# The one-product bar is blue and the rest are grey to highlight that
# most buyers purchased exactly one organic product.
plt.figure(figsize=(20, 10))
bar_colors = ['blue' if amt == 1 else 'lightgrey' for amt in amt_purchased['TargetAmt']]
ax = sns.barplot(data=amt_purchased, x='TargetAmt', y='Customers', palette=bar_colors)
ax.set(xlabel=None, ylabel=None)  # drop the default axis labels
# Tick label sizes: larger on x to emphasize the categories
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
# Title the column chart
plt.title("Number of Customers by Amount of Organic Products Purchased", fontsize=20)
sns.despine()  # remove the top and right borders
# Annotate each bar with its customer count, centered 100 units above the bar.
# Adapted from https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(x=bar.get_x() + bar.get_width() / 2,  # horizontal middle of the bar
            y=bar_height + 100,                   # padded 100 above the bar top
            s='{:.0f}'.format(bar_height),        # whole-number customer count
            fontsize=14,
            ha='center')                          # center-align the label
plt.show()
%%shell
# Colab cell magic: run the line below in a shell.
# Exports this notebook to a standalone HTML file alongside the .ipynb in Drive.
jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/Untitled5.ipynb"